
Commit e9692c6

Add vision multimodal support and Gemma3 vision (#139)
Add support for exporting vision multimodal models and enable Gemma3 vision as the first example.
1 parent 216160a commit e9692c6
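
For orientation, here is a rough end-to-end sketch of what this commit enables. The checkpoint id, the `from_pretrained` arguments, and the exact `text_generation` call shape are assumptions for illustration only; the model class, the "vision" modality handling, and the conversation format come from the changes below.

from transformers import AutoProcessor

from optimum.executorch.modeling import ExecuTorchModelForMultiModalToText

model_id = "google/gemma-3-4b-it"  # assumed Gemma3 vision checkpoint

# Assumed invocation: export the multimodal model to ExecuTorch on the fly.
model = ExecuTorchModelForMultiModalToText.from_pretrained(model_id, recipe="xnnpack")  # recipe kwarg assumed
processor = AutoProcessor.from_pretrained(model_id)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# text_generation() applies the chat template, picks up pixel_values for the
# "vision" modality, and routes them to the vision encoder inside the .pte.
# Call shape below is assumed for illustration.
output = model.text_generation(processor, conversation, max_seq_len=128)
print(output)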

File tree: 7 files changed, +216 -58 lines

optimum/executorch/modeling.py

Lines changed: 28 additions & 23 deletions
@@ -45,7 +45,7 @@
 
 from ..exporters import TasksManager
 from ..exporters.executorch import main_export
-from ..exporters.executorch.utils import verify_eos_tokens_in_pretrained_tokenizer
+from ..exporters.executorch.utils import apply_chat_template_with_fallback, verify_eos_tokens_in_pretrained_tokenizer
 from ..modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel
 from ..utils.file_utils import find_files_matching_pattern
 from .stats import Stats
@@ -1167,8 +1167,9 @@ class ExecuTorchModelForMultiModalToText(ExecuTorchModelBase):
            Size of the model vocabulary.
    """

-    # Using general `AutoModel` since it usually routes to the correct model variant and there is no
-    # auto model class that captures both audio and image.
+    # Using `AutoModel` since there is no auto model class that captures both audio and image.
+    # This is not too important since it is only used to automatically infer the task type,
+    # and for multimodal models the task type should always be specified explicitly.
    auto_model_class = AutoModel

    def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
@@ -1180,6 +1181,7 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
                f"Exported .pte file needs to contain the following required methods: {required_methods}"
            )

+        # Multimodal-related metadata.
        self.encoder_name = None
        for method_name in self.model.method_names():
            if method_name == "audio_encoder":
@@ -1192,38 +1194,28 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
            raise ValueError(
                'Exported .pte file needs to contain either an "audio_encoder" or a "vision_encoder" in its methods.'
            )
+        self.modality = self.model.run_method("modality")[0]

+        # Decoder-related metadata.
        metadata = self.model.method_names()
-        if "use_kv_cache" in metadata:
-            self.use_kv_cache = self.model.run_method("use_kv_cache")[0]
        if "get_max_seq_len" in metadata:
            self.max_cache_size = self.model.run_method("get_max_seq_len")[0]
-        if "get_max_batch_size" in metadata:
-            self.max_batch_size = self.model.run_method("get_max_batch_size")[0]
-        if "get_dtype" in metadata:
-            self.dtype = self.model.run_method("get_dtype")[0]
        if "get_bos_id" in metadata:
            self.bos_token_id = self.model.run_method("get_bos_id")[0]
        if "get_eos_id" in metadata:
            self.eos_token_id = self.model.run_method("get_eos_id")[0]
-        if "get_vocab_size" in metadata:
-            self.vocab_size = self.model.run_method("get_vocab_size")[0]
-        if "max_hidden_seq_length" in metadata:
-            self.max_hidden_seq_length = self.model.run_method("max_hidden_seq_length")[0]
-        if "decoder_start_token_id" in metadata:
-            self.decoder_start_token_id = self.model.run_method("decoder_start_token_id")[0]

    def forward(
        self,
        input_ids: torch.Tensor,
        cache_position: torch.Tensor,
-        input_features: Optional[torch.Tensor] = None,
+        multimodal_features: Optional[torch.Tensor] = None,
    ):
        token_embeddings = self.model.run_method("token_embedding", (input_ids,))[0]
-        if input_features is not None:
+        if multimodal_features is not None:
            encoder_embeddings = self.model.run_method(
                self.encoder_name,
-                (input_features,),
+                (multimodal_features,),
            )[0]
            encoder_token_mask = input_ids == self.encoder_token_id
            token_embeddings[encoder_token_mask] = encoder_embeddings
@@ -1242,7 +1234,7 @@ def generate(
        echo: bool = False,
        pos_base: int = 0,
        max_seq_len: Optional[int] = None,
-        input_features: Optional[torch.Tensor] = None,
+        multimodal_features: Optional[torch.Tensor] = None,
    ) -> List[int]:
        self.device = torch.device("cpu")
        if max_seq_len is None:
@@ -1259,7 +1251,7 @@ def generate(
        logits = self.forward(
            input_ids=prompt_tokens,
            cache_position=torch.arange(prompt_tokens.size(1), dtype=torch.long, device=self.device),
-            input_features=input_features,
+            multimodal_features=multimodal_features,
        )
        self.stats.on_sampling_end()
        self.stats.on_prompt_eval_end()
@@ -1279,7 +1271,7 @@ def generate(
                    dtype=torch.long,
                    device=self.device,
                ),
-                input_features=None,
+                multimodal_features=None,
            )
            self.stats.on_sampling_end()
            if not first_token_generated:
@@ -1337,13 +1329,26 @@ def text_generation(
        self.stats.reset()
        self.stats.on_inference_start()

-        inputs = processor.apply_chat_template(input_conversation)
+        inputs = apply_chat_template_with_fallback(
+            processor,
+            input_conversation,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+
        self.stats.on_token_encode_end()
        self.stats.set_num_prompt_tokens(len(inputs["input_ids"][0]))

+        multimodal_features = None
+        if self.modality == "vision":
+            multimodal_features = inputs.get("pixel_values", None)
+        elif self.modality == "audio":
+            multimodal_features = inputs.get("input_features", None)
        generated_tokens = self.generate(
            prompt_tokens=inputs["input_ids"],
-            input_features=inputs["input_features"],
+            multimodal_features=multimodal_features,
            echo=echo,
            max_seq_len=len(inputs["input_ids"][0]) + max_seq_len,
        )
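
Net effect of the modeling.py changes: the processor output is inspected per modality, and the encoder features travel under the generic `multimodal_features` name all the way into `generate()`. A condensed sketch of the new call path follows; `model` stands for an already loaded `ExecuTorchModelForMultiModalToText`, and the Gemma3 checkpoint id is an assumption.

from transformers import AutoProcessor

from optimum.exporters.executorch.utils import apply_chat_template_with_fallback

processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")  # assumed checkpoint
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
            {"type": "text", "text": "What is in this picture?"},
        ],
    }
]

# Same preprocessing that text_generation() now performs internally.
inputs = apply_chat_template_with_fallback(
    processor,
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)

# Modality-aware feature selection: vision processors expose "pixel_values",
# audio processors expose "input_features"; both flow as multimodal_features.
modality = "vision"
multimodal_features = inputs.get("pixel_values") if modality == "vision" else inputs.get("input_features")

generated = model.generate(  # `model` assumed loaded beforehand
    prompt_tokens=inputs["input_ids"],
    multimodal_features=multimodal_features,
    echo=False,
    max_seq_len=len(inputs["input_ids"][0]) + 64,
)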

optimum/exporters/executorch/integrations.py

Lines changed: 65 additions & 15 deletions
@@ -33,7 +33,50 @@

 from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache

-from .utils import save_config_to_constant_methods
+from .utils import apply_chat_template_with_fallback, save_config_to_constant_methods
+
+
+class VisionExportableModule(torch.nn.Module):
+    def __init__(self, model: torch.nn.Module):
+        super().__init__()
+        self.model = model
+
+    def prepare_export_inputs(self):
+        # 1. Get export inputs.
+        model_id = self.model.config.name_or_path
+        processor = AutoProcessor.from_pretrained(model_id)
+        sample_conversation_with_image = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+                ],
+            },
+        ]
+        processed_inputs = processor.apply_chat_template(
+            sample_conversation_with_image,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        if "pixel_values" not in processed_inputs:
+            raise ValueError(
+                f"Unable to obtain sample vision encoder inputs for export for {model_id} - the processor did not return formatted inputs with the 'pixel_values' key: {processed_inputs}"
+            )
+        export_inputs = processed_inputs["pixel_values"]
+
+        # 2. Get export dynamic shapes.
+        dynamic_shapes = None  # No batching for now.
+
+        return export_inputs, dynamic_shapes
+
+    def forward(
+        self,
+        input_features: torch.FloatTensor,
+    ):
+        image_embeds = self.model.get_image_features(input_features)
+        return image_embeds.unsqueeze(0)


 class AudioExportableModule(torch.nn.Module):
@@ -56,7 +99,14 @@ def prepare_export_inputs(self):
                ],
            }
        ]
-        processed_inputs = processor.apply_chat_template(sample_conversation_with_audio)
+        processed_inputs = apply_chat_template_with_fallback(
+            processor,
+            sample_conversation_with_audio,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
        if "input_features" not in processed_inputs:
            raise ValueError(
                f"Unable to obtain sample audio encoder inputs for export for {model_id} - the processor did not return formatted inputs with the 'input_features' key: {processed_inputs}"
@@ -129,9 +179,13 @@ def __init__(
        self.processor_config = processor_config
        self.use_custom_kv_cache = use_custom_kv_cache
        self.use_custom_sdpa = use_custom_sdpa
-        modality_token_placeholder_id_kwargs = {f"{modality}_token_id": getattr(self.config, f"{modality}_token_id")}
+        additional_metadata_kwargs = {"modality": modality}
+        if modality == "audio":
+            additional_metadata_kwargs[f"{modality}_token_id"] = getattr(self.config, "audio_token_id")
+        elif modality == "vision":
+            additional_metadata_kwargs[f"{modality}_token_id"] = getattr(self.config, "image_token_id")
        self.metadata = save_config_to_constant_methods(
-            model.config.text_config, model.generation_config, processor_config, **modality_token_placeholder_id_kwargs
+            model.config.text_config, model.generation_config, processor_config, **additional_metadata_kwargs
        )
        logging.info(f"Metadata to be recorded in PTE: {self.metadata}")

@@ -148,9 +202,9 @@ def _prepare_text_embedding_export_inputs(self, max_seq_len: int):
        example_input_ids = torch.zeros((1, seq_length), dtype=torch.long)

        seq_len_dim = torch.export.Dim("seq_length_dim", max=max_seq_len)
-        dynamic_shapes = {
-            "input": {1: seq_len_dim},
-        }  # nn.Embedding forward() args are here - https://github.com/pytorch/pytorch/blob/febf3c475e6fe369b41ef009f3598659a6df0911/torch/nn/modules/sparse.py#L15.
+        # Don't use named dynamic shapes since embedding modules can have different arg
+        # names for the input, e.g. nn.Embedding vs. embedding modules defined in Transformers.
+        dynamic_shapes = ({1: seq_len_dim},)

        return example_input_ids, dynamic_shapes

@@ -213,7 +267,7 @@ def export(

        # 1. Export text decoder.
        exportable_module = TorchExportableModuleForDecoderOnlyLM(
-            getattr(self.model, self.decoder_name),
+            self.model,
        )
        exported_programs = {}

@@ -267,7 +321,7 @@ def export(
        )

        token_embedding_exported_program = torch.export.export(
-            getattr(self.model, self.decoder_name).get_input_embeddings(),
+            self.model.get_input_embeddings(),
            args=(input_ids,),
            kwargs={},
            dynamic_shapes=dynamic_shapes,
@@ -281,13 +335,13 @@ def export(

        if self.modality == "audio":
            encoder = AudioExportableModule(self.model)
-            input_features, dynamic_shapes = encoder.prepare_export_inputs()
        elif self.modality == "vision":
-            raise ValueError("Vision is not yet supported, this will be available soon.")
+            encoder = VisionExportableModule(self.model)
        else:
            raise ValueError(
                f"{self.model.config.name_or_path} has a modality that is not yet supported in Optimum - please file an issue."
            )
+        input_features, dynamic_shapes = encoder.prepare_export_inputs()

        logging.info(
            f"Exporting {self.modality} encoder using input_features({input_features.shape}), dynamic_shapes={dynamic_shapes}"
@@ -388,10 +442,6 @@ def export(
            f"Exporting using input_ids({input_ids.shape})={input_ids}, cache_position({cache_position.shape})={cache_position}, dynamic_shapes={dynamic_shapes}, strict={strict}"
        )

-        from transformers.integrations.executorch import (
-            TorchExportableModuleForDecoderOnlyLM,
-        )
-
        exportable_module = TorchExportableModuleForDecoderOnlyLM(
            self.model,
            max_batch_size=1,
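
To make the new vision branch concrete, here is a standalone sketch of how `VisionExportableModule` drives `torch.export` on its own; the Gemma3 checkpoint id is an assumption, and the full export in the repo runs through `MultiModalTextToTextExportableModule.export()` rather than this bare call.

import torch
from transformers import AutoModelForPreTraining

from optimum.exporters.executorch.integrations import VisionExportableModule

# Assumed Gemma3 vision checkpoint; AutoModelForPreTraining resolves it to the
# top-level <Model>ForConditionalGeneration variant (see the tasks change below).
model = AutoModelForPreTraining.from_pretrained("google/gemma-3-4b-it")

encoder = VisionExportableModule(model)
# Runs the processor on a sample image conversation and returns pixel_values
# plus dynamic shapes (currently None, i.e. no batching).
input_features, dynamic_shapes = encoder.prepare_export_inputs()

with torch.no_grad():
    vision_encoder_ep = torch.export.export(
        encoder,
        args=(input_features,),
        kwargs={},
        dynamic_shapes=dynamic_shapes,
    )
print(vision_encoder_ep)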

optimum/exporters/executorch/tasks/multimodal_text_to_text.py

Lines changed: 17 additions & 11 deletions
@@ -17,7 +17,7 @@
 import os.path

 import torchao
-from transformers import AutoConfig, AutoModel, GenerationConfig
+from transformers import AutoConfig, AutoModelForPreTraining, GenerationConfig

 from ..integrations import MultiModalTextToTextExportableModule
 from ..quantization import quantize_model_
@@ -159,21 +159,27 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
    if hasattr(config, "use_cache") and config.use_cache is False:
        config.use_cache = True

-    eager_model = AutoModel.from_pretrained(
+    # Using `AutoModelForPreTraining` since it usually routes to the correct model variant and there is no
+    # auto model class that captures both audio and image.
+    # The correct model variant we are looking for is <Model>ForConditionalGeneration, since it is the top-level
+    # model and thus will always contain the necessary model components. As an example of why this is needed,
+    # if you just use Gemma3Model instead of Gemma3ForConditionalGeneration, Gemma3Model (which is the decoder part)
+    # will not contain the LM head, which is only applied in the latter.
+    eager_model = AutoModelForPreTraining.from_pretrained(
        model_name_or_path,
        device_map=device,
        torch_dtype=dtype,
        config=config,
        attn_implementation=attn_implementation,
-        generation_config=GenerationConfig(
-            use_cache=True,
-            cache_implementation=cache_implementation,
-            max_length=max_length,
-            cache_config={
-                "batch_size": batch_size,
-                "max_cache_len": max_length,
-            },
-        ),
+    )
+    eager_model.generation_config = GenerationConfig(
+        use_cache=True,
+        cache_implementation=cache_implementation,
+        max_length=max_length,
+        cache_config={
+            "batch_size": batch_size,
+            "max_cache_len": max_length,
+        },
    )
    decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model)
    encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
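
A small sketch of why the auto-class swap matters and how the generation config is now attached after loading. The checkpoint id and the concrete cache values are illustrative assumptions; the class-resolution behavior is the one described in the comment above.

from transformers import AutoModel, AutoModelForPreTraining, GenerationConfig

model_id = "google/gemma-3-4b-it"  # assumed Gemma3 vision checkpoint

# AutoModel resolves to Gemma3Model, which lacks the LM head, while
# AutoModelForPreTraining resolves to Gemma3ForConditionalGeneration,
# the top-level model that carries every component needed for export.
decoder_part = AutoModel.from_pretrained(model_id)
full_model = AutoModelForPreTraining.from_pretrained(model_id)
print(type(decoder_part).__name__)  # Gemma3Model
print(type(full_model).__name__)    # Gemma3ForConditionalGeneration

# The generation config is now assigned after from_pretrained() instead of
# being passed in; the values below are placeholders for the real kwargs.
full_model.generation_config = GenerationConfig(
    use_cache=True,
    cache_implementation="static",
    max_length=1024,
    cache_config={"batch_size": 1, "max_cache_len": 1024},
)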

optimum/exporters/executorch/utils.py

Lines changed: 24 additions & 0 deletions
@@ -71,6 +71,30 @@ def save_config_to_constant_methods(
     return {k: v for k, v in {**metadata, **kwargs}.items() if v is not None}


+def apply_chat_template_with_fallback(processor, conversation, **kwargs):
+    """
+    Apply a chat template, with a fallback for external processors.
+
+    Duck-typed external processors that aren't defined in Transformers, e.g.
+    Voxtral's processor which is defined in mistral-common, aren't guaranteed
+    to accept kwargs such as "add_generation_prompt", so fall back to calling
+    apply_chat_template with the conversation alone.
+
+    Args:
+        processor: The processor instance
+        conversation: The conversation to process
+        **kwargs: Additional keyword arguments to pass to apply_chat_template
+
+    Returns:
+        The processed inputs from apply_chat_template
+    """
+    try:
+        return processor.apply_chat_template(conversation, **kwargs)
+    except ValueError:
+        # Fallback for external processors - just pass the conversation.
+        return processor.apply_chat_template(conversation)
+
+
 def verify_eos_tokens_in_pretrained_tokenizer(model_eos_ids: List[int], tokenizer: PreTrainedTokenizer) -> bool:
     """
     Verifies that the model's EOS token IDs are present in the tokenizer's
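
To illustrate the fallback contract, a tiny self-contained example with a stand-in duck-typed processor; the `StrictExternalProcessor` class is hypothetical and exists only for demonstration.

from optimum.exporters.executorch.utils import apply_chat_template_with_fallback


class StrictExternalProcessor:
    """Stand-in for an external processor (mistral-common style) that
    rejects Transformers-only kwargs such as add_generation_prompt."""

    def apply_chat_template(self, conversation, **kwargs):
        if kwargs:
            raise ValueError(f"Unexpected kwargs: {sorted(kwargs)}")
        return {"input_ids": [[1, 2, 3]]}


processor = StrictExternalProcessor()
conversation = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]

# The first attempt passes the full kwargs; the ValueError triggers the bare retry.
inputs = apply_chat_template_with_fallback(
    processor,
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
print(inputs["input_ids"])  # [[1, 2, 3]]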
