Commit d4d3046

Add Granite Speech 3.3 (#144)

1 parent 828ae02 commit d4d3046

6 files changed: +267 −50 lines changed

optimum/executorch/modeling.py

Lines changed: 41 additions & 13 deletions
@@ -45,7 +45,10 @@
 
 from ..exporters import TasksManager
 from ..exporters.executorch import main_export
-from ..exporters.executorch.utils import apply_chat_template_with_fallback, verify_eos_tokens_in_pretrained_tokenizer
+from ..exporters.executorch.utils import (
+    process_conversation_inputs,
+    verify_eos_tokens_in_pretrained_tokenizer,
+)
 from ..modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel
 from ..utils.file_utils import find_files_matching_pattern
 from .stats import Stats
@@ -88,7 +91,11 @@ class ExecuTorchModelBase(OptimizedModel, ABC):
 
     auto_model_class = None
 
-    def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
+    def __init__(
+        self,
+        models: Dict[str, "ExecuTorchModule"],
+        config: "PretrainedConfig",
+    ):
         super().__init__(model=None, config=config)
 
         if self.__class__.auto_model_class is None:
@@ -444,7 +451,11 @@ class ExecuTorchModelForSeq2SeqLM(ExecuTorchModelBase):
 
     auto_model_class = AutoModelForSeq2SeqLM
 
-    def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
+    def __init__(
+        self,
+        models: Dict[str, "ExecuTorchModule"],
+        config: "PretrainedConfig",
+    ):
         super().__init__(models=models, config=config)
         if not hasattr(self, "encoder"):
             raise AttributeError("Expected attribute 'encoder' not found in the instance.")
@@ -640,7 +651,11 @@ class ExecuTorchModelForCausalLM(ExecuTorchModelBase):
 
     auto_model_class = AutoModelForCausalLM
 
-    def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
+    def __init__(
+        self,
+        models: Dict[str, "ExecuTorchModule"],
+        config: "PretrainedConfig",
+    ):
         super().__init__(models, config)
         if not hasattr(self, "model"):
             raise AttributeError("Expected attribute 'model' not found in the instance.")
@@ -862,7 +877,11 @@ class ExecuTorchModelForMaskedLM(ExecuTorchModelBase):
 
     auto_model_class = AutoModelForMaskedLM
 
-    def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
+    def __init__(
+        self,
+        models: Dict[str, "ExecuTorchModule"],
+        config: "PretrainedConfig",
+    ):
         super().__init__(models, config)
         if not hasattr(self, "model"):
             raise AttributeError("Expected attribute 'model' not found in the instance.")
@@ -934,7 +953,11 @@ class ExecuTorchModelForImageClassification(ExecuTorchModelBase):
 
     auto_model_class = AutoModelForImageClassification
 
-    def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
+    def __init__(
+        self,
+        models: Dict[str, "ExecuTorchModule"],
+        config: "PretrainedConfig",
+    ):
         super().__init__(models, config)
         if not hasattr(self, "model"):
             raise AttributeError("Expected attribute 'model' not found in the instance.")
@@ -993,7 +1016,11 @@ class ExecuTorchModelForSpeechSeq2Seq(ExecuTorchModelBase):
 
     auto_model_class = AutoModelForSpeechSeq2Seq
 
-    def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
+    def __init__(
+        self,
+        models: Dict[str, "ExecuTorchModule"],
+        config: "PretrainedConfig",
+    ):
         super().__init__(models=models, config=config)
         if not hasattr(self, "encoder"):
             raise AttributeError("Expected attribute 'encoder' not found in the instance.")
@@ -1172,7 +1199,11 @@ class ExecuTorchModelForMultiModalToText(ExecuTorchModelBase):
     # task type. For MultiModal, we should always be specifying the task type anyways.
     auto_model_class = AutoModel
 
-    def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
+    def __init__(
+        self,
+        models: Dict[str, "ExecuTorchModule"],
+        config: "PretrainedConfig",
+    ):
         super().__init__(models=models, config=config)
         required_methods = ["text_decoder", "token_embedding"]
         for required_method in required_methods:
@@ -1329,13 +1360,10 @@ def text_generation(
         self.stats.reset()
         self.stats.on_inference_start()
 
-        inputs = apply_chat_template_with_fallback(
+        inputs = process_conversation_inputs(
             processor,
+            tokenizer,
             input_conversation,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
        )
 
         self.stats.on_token_encode_end()
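
text_generation now delegates all prompt formatting to the new process_conversation_inputs helper (defined in optimum/exporters/executorch/utils.py, shown further down). A minimal usage sketch of that call path; the checkpoint id and audio URL are illustrative assumptions, not values from this commit:

from transformers import AutoProcessor, AutoTokenizer

from optimum.exporters.executorch.utils import process_conversation_inputs

model_id = "ibm-granite/granite-speech-3.3-2b"  # assumed checkpoint id
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

conversation = [
    # For GraniteSpeechProcessor, the helper expects a top-level audio item
    # and removes it from the conversation before applying the chat template.
    {"type": "audio", "content": "https://example.com/sample.wav"},  # illustrative URL
    {"role": "user", "content": [{"type": "text", "text": "Transcribe the audio."}]},
]

inputs = process_conversation_inputs(processor, tokenizer, conversation)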

optimum/exporters/executorch/integrations.py

Lines changed: 42 additions & 22 deletions
@@ -20,6 +20,7 @@
 from torch.export import ExportedProgram
 from torch.nn.attention import SDPBackend
 from transformers import (
+    AutoConfig,
     AutoProcessor,
     PreTrainedModel,
     StaticCache,
@@ -88,30 +89,47 @@ def prepare_export_inputs(self):
         # 1. Get export inputs
         model_id = self.model.config.name_or_path
         processor = AutoProcessor.from_pretrained(model_id)
-        sample_conversation_with_audio = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "url": "https://huggingface.co/datasets/eustlb/audio-samples/resolve/main/dude_where_is_my_car.wav",
-                    },
-                ],
-            }
-        ]
-        processed_inputs = apply_chat_template_with_fallback(
-            processor,
-            sample_conversation_with_audio,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
+        config = AutoConfig.from_pretrained(model_id)
+
+        if config.model_type == "granite_speech":
+            import torchaudio
+            from huggingface_hub import hf_hub_download
+
+            audio_path = hf_hub_download(repo_id=model_id, filename="10226_10111_000000.wav")
+            wav, _sampling_rate = torchaudio.load(audio_path, normalize=True)
+            processed_inputs = processor(
+                "",  # No text needed.
+                wav,
+                return_tensors="pt",
+            )
+        else:
+            sample_conversation_with_audio = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "audio",
+                            "url": "https://huggingface.co/datasets/eustlb/audio-samples/resolve/main/dude_where_is_my_car.wav",
+                        },
+                    ],
+                }
+            ]
+            processed_inputs = apply_chat_template_with_fallback(
+                processor,
+                sample_conversation_with_audio,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+            )
         if "input_features" not in processed_inputs:
             raise ValueError(
                 f"Unable to obtain sample audio encoder inputs for export for {model_id} - the processor did not return formatted inputs with the 'input_features' key: {processed_inputs}"
             )
         export_inputs = processed_inputs["input_features"]
+        # Make sure the export inputs has a batch size > 1 so that it doesn't 0/1 specialize.
+        if export_inputs.shape[0] == 1:
+            export_inputs = export_inputs.repeat(2, 1, 1)
 
         # 2. Get export dynamic shapes
         # For certain models like Voxtral, each 30 seconds represent one batch. So theoretically this caps
@@ -129,7 +147,11 @@ def forward(
         self,
         input_features: torch.FloatTensor,
     ):
-        audio_embeds = self.model.get_audio_embeds(input_features)
+        # TODO: remove on next Transformers pin bump.
+        if hasattr(self.model, "get_audio_embeds"):
+            audio_embeds = self.model.get_audio_embeds(input_features)
+        else:
+            audio_embeds = self.model.get_audio_features(input_features)
         return audio_embeds.unsqueeze(0)
 
 
@@ -164,8 +186,6 @@ def __init__(
     ):
         super().__init__()
 
-        if modality not in encoder_name:
-            raise ValueError(f'encoder_name "{encoder_name}" does not match specified modality "{modality}".')
         if not hasattr(model, encoder_name):
             raise ValueError(f'Model does not contain encoder "{encoder_name}".')
 
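
The new batch-padding guard in prepare_export_inputs exists because torch.export specializes dimensions whose example value is 0 or 1, which would freeze the exported audio encoder at batch size 1. A self-contained sketch of the idea; the module and tensor shapes are illustrative, not from this commit:

import torch
from torch.export import Dim, export


class TinyEncoder(torch.nn.Module):
    def forward(self, input_features):
        return input_features * 2.0


example = torch.randn(1, 3000, 128)  # a batch of 1 would get 0/1-specialized
if example.shape[0] == 1:
    example = example.repeat(2, 1, 1)  # pad the batch dim to 2, as in the diff above

# With batch >= 2 in the example input, the batch dimension can be kept dynamic.
ep = export(
    TinyEncoder(),
    (example,),
    dynamic_shapes={"input_features": {0: Dim("batch")}},
)
print(ep)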

optimum/exporters/executorch/tasks/multimodal_text_to_text.py

Lines changed: 4 additions & 6 deletions
@@ -39,6 +39,7 @@ def _validate_multimodal_components(model):
     "text_model",
 ]
 POTENTIAL_AUDIO_ENCODER_NAMES = [
+    "encoder",  # Here mainly for Granite Speech.
     "audio_tower",
     "audio_model",
 ]
@@ -146,12 +147,9 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
     except (OSError, json.JSONDecodeError):
         processor_config = None
 
-    # Make sure config has text_config and vision_config:
-    if not (hasattr(config, "text_config") and (hasattr(config, "vision_config") or hasattr(config, "audio_config"))):
-        raise ValueError(
-            f"The model {model_name_or_path} does not have a `text_config` or `vision_config`/`audio_config` attribute in its config. "
-            "This is required for multimodal text-to-text models."
-        )
+    # Make sure config has text_config.
+    if not (hasattr(config, "text_config")):
+        raise ValueError(f"The model {model_name_or_path} does not have a `text_config`.")
 
     if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
         # NOTE: Avoid hitting the data-dependent control flow in _longrope_frequency_update.
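
Adding "encoder" to the candidate list lets Granite Speech's plainly named encoder attribute be discovered. Presumably the surrounding validation scans the list and takes the first attribute present on the model; a sketch of that pattern under that assumption (the repo's actual helper may differ):

from typing import Optional

POTENTIAL_AUDIO_ENCODER_NAMES = [
    "encoder",  # Here mainly for Granite Speech.
    "audio_tower",
    "audio_model",
]


def find_audio_encoder_name(model) -> Optional[str]:
    # Return the first candidate attribute that exists on the model, so the
    # generic "encoder" name resolves for Granite Speech before the
    # audio-specific names are tried.
    for name in POTENTIAL_AUDIO_ENCODER_NAMES:
        if hasattr(model, name):
            return name
    return None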

optimum/exporters/executorch/utils.py

Lines changed: 77 additions & 1 deletion
@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Optional, Set
+import copy
+import io
+import logging
+from typing import Any, Dict, List, Optional, Set
 
 import torch
+import transformers
 from transformers import GenerationConfig, PretrainedConfig
+from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils import PreTrainedTokenizer
 
 
@@ -126,3 +131,74 @@ def verify_eos_tokens_in_pretrained_tokenizer(model_eos_ids: List[int], tokenize
     is_valid = any(model_id in candidate_eos_ids for model_id in model_eos_ids)
 
     return is_valid
+
+
+def process_conversation_inputs(
+    processor: ProcessorMixin,
+    tokenizer: PreTrainedTokenizer,
+    input_conversation: List[Dict[str, Any]],
+):
+    """
+    Process input conversation for multimodal models.
+
+    This function handles the preprocessing of conversation inputs, with special handling for
+    GraniteSpeechProcessor, which requires extracting and processing audio content from conversations
+    prior to feeding into the processor.
+
+    Args:
+        processor: The processor to use for input processing
+        tokenizer: The tokenizer to use for text processing
+        input_conversation: List of conversation messages, may contain audio content
+
+    Returns:
+        Processed inputs ready for model consumption
+    """
+    if isinstance(processor, transformers.models.granite_speech.processing_granite_speech.GraniteSpeechProcessor):
+        import requests
+        import torchaudio
+
+        conversation = copy.deepcopy(input_conversation)
+        audio_path = None
+
+        # Extract audio content and remove from conversation
+        audio_items = [(i, item) for i, item in enumerate(conversation) if item.get("type") == "audio"]
+        if audio_items:
+            idx, audio_item = audio_items[0]
+            audio_path = audio_item["content"]
+            # Remove the audio content from the input conversation since it
+            # is handled outside for Granite.
+            del conversation[idx]
+        else:
+            raise ValueError("No audio content found in conversation")
+
+        # Download and process audio
+        try:
+            resp = requests.get(audio_path)
+            resp.raise_for_status()
+            buf = io.BytesIO(resp.content)
+        except requests.exceptions.RequestException as e:
+            raise RuntimeError("Could not download input audio file.") from e
+
+        wav, sampling_rate = torchaudio.load(buf, normalize=True)
+        if wav.shape[0] != 1:
+            wav = wav.mean(dim=0, keepdim=True)  # Convert stereo to mono.
+            logging.warning("Downmixed audio from stereo to mono")
+        if sampling_rate != 16000:
+            wav = torchaudio.functional.resample(wav, sampling_rate, 16000)
+            logging.warning(f"Resampled audio from {sampling_rate}Hz to 16000Hz")
+
+        # Generate text prompt and process with audio
+        prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+        inputs = processor(prompt, wav, return_tensors="pt")
+    else:
+        # Standard processing for other processors
+        inputs = apply_chat_template_with_fallback(
+            processor,
+            input_conversation,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+
+    return inputs
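
The Granite branch above normalizes arbitrary input audio to mono 16 kHz before handing it to the processor. For reference, the same steps in isolation as a runnable sketch; the WAV URL is illustrative, not from this commit:

import io

import requests
import torchaudio

url = "https://example.com/sample.wav"  # illustrative URL
resp = requests.get(url)
resp.raise_for_status()

wav, sampling_rate = torchaudio.load(io.BytesIO(resp.content), normalize=True)
if wav.shape[0] != 1:
    wav = wav.mean(dim=0, keepdim=True)  # downmix multi-channel audio to mono
if sampling_rate != 16000:
    wav = torchaudio.functional.resample(wav, sampling_rate, 16000)  # Granite Speech expects 16 kHz

print(wav.shape)  # torch.Size([1, num_samples]) at 16 kHz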
