
Commit b0de162

Cover more model architectures (#107)
Authored by guangy10 and Github Executorch
Co-authored-by: Github Executorch <[email protected]>
1 parent e6d1d6f commit b0de162

12 files changed: +790 −16 lines changed

README.md

Lines changed: 11 additions & 1 deletion
@@ -151,16 +151,26 @@ We currently support a wide range of popular transformer models, including encod
  - [Eurobert](https://huggingface.co/EuroBERT/EuroBERT-210m): `EuroBERT-210m` and its variants
  - [Roberta](https://huggingface.co/FacebookAI/xlm-roberta-base): FacebookAI's `xlm-roberta-base` and its variants
  #### Decoder-only models
+ - [Codegen](https://huggingface.co/Salesforce/codegen-350M-mono): Salesforce's `codegen-350M-mono` and its variants
  - [Gemma](https://huggingface.co/google/gemma-2b): `Gemma-2b` and its variants
  - [Gemma2](https://huggingface.co/google/gemma-2-2b): `Gemma-2-2b` and its variants
- - [Gemma3](https://huggingface.co/google/gemma-3-1b-it): `Gemma-3-1b` and its variants *(requires `transformers >= 4.52.0`)*
+ - [Gemma3](https://huggingface.co/google/gemma-3-1b-it): `Gemma-3-1b` and its variants
+ - [Glm](https://huggingface.co/THUDM/glm-edge-1.5b-chat): `glm-edge-1.5b` and its variants
+ - [Gpt2](https://huggingface.co/AI-Sweden-Models/gpt-sw3-126m): `gpt-sw3-126m` and its variants
+ - [GptJ](https://huggingface.co/Milos/slovak-gpt-j-405M): `gpt-j-405M` and its variants
+ - [GptNeoX](https://huggingface.co/EleutherAI/pythia-14m): EleutherAI's `pythia-14m` and its variants
+ - [GptNeoXJapanese](https://huggingface.co/abeja/gpt-neox-japanese-2.7b): `gpt-neox-japanese-2.7b` and its variants
+ - [Granite](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct): `granite-3.3-2b-instruct` and its variants
  - [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B): `Llama-3.2-1B` and its variants
+ - [Mistral](https://huggingface.co/ministral/Ministral-3b-instruct): `Ministral-3b-instruct` and its variants
  - [Qwen2](https://huggingface.co/Qwen/Qwen2.5-0.5B): `Qwen2.5-0.5B` and its variants
  - [Qwen3](https://huggingface.co/Qwen/Qwen3-0.6B): `Qwen3-0.6B`, `Qwen3-Embedding-0.6B` and other variants
  - [Olmo](https://huggingface.co/allenai/OLMo-1B-hf): `OLMo-1B-hf` and its variants
+ - [Phi](https://huggingface.co/johnsnowlabs/JSL-MedPhi2-2.7B): `JSL-MedPhi2-2.7B` and its variants
  - [Phi4](https://huggingface.co/microsoft/Phi-4-mini-instruct): `Phi-4-mini-instruct` and its variants
  - [Smollm](https://huggingface.co/HuggingFaceTB/SmolLM2-135M): 🤗 `SmolLM2-135M` and its variants
  - [Smollm3](https://huggingface.co/HuggingFaceTB/SmolLM3-3B): 🤗 `SmolLM3-3B` and its variants
+ - [Starcoder2](https://huggingface.co/bigcode/starcoder2-3b): `starcoder2-3b` and its variants
  #### Encoder-decoder models
  - [T5](https://huggingface.co/google-t5/t5-small): Google's `T5` and its variants
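The newly added architectures follow the same export-and-generate flow as the existing ones. A minimal sketch using one of the new entries (model id, prompt, recipe, and the `text_generation` call are taken from the tests added in this commit; quantization kwargs are omitted here):

```python
from transformers import AutoTokenizer

from optimum.executorch import ExecuTorchModelForCausalLM

model_id = "Salesforce/codegen-350M-mono"  # any newly supported decoder-only model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Export the eager checkpoint to ExecuTorch with the XNNPACK recipe, then generate.
model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack")
generated_text = model.text_generation(tokenizer=tokenizer, prompt="def hello_world():", max_seq_len=64)
print(generated_text)
```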

optimum/exporters/executorch/tasks/causal_lm.py

Lines changed: 52 additions & 15 deletions
@@ -71,22 +71,59 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
      if hasattr(config, "use_cache") and config.use_cache is False:
          config.use_cache = True

-     eager_model = AutoModelForCausalLM.from_pretrained(
-         model_name_or_path,
-         device_map=device,
-         torch_dtype=dtype,
-         config=config,
-         attn_implementation=attn_implementation,
-         generation_config=GenerationConfig(
-             use_cache=True,
-             cache_implementation=cache_implementation,
-             max_length=max_length,
-             cache_config={
-                 "batch_size": batch_size,
-                 "max_cache_len": max_length,
-             },
-         ),
-     )
+     def _load_eager_pretrained(
+         model_name_or_path,
+         device,
+         dtype,
+         config,
+         attn_implementation,
+         cache_implementation,
+         batch_size,
+         max_length,
+     ):
+         eager_model = AutoModelForCausalLM.from_pretrained(
+             model_name_or_path,
+             device_map=device,
+             torch_dtype=dtype,
+             config=config,
+             attn_implementation=attn_implementation,
+             generation_config=GenerationConfig(
+                 use_cache=True,
+                 cache_implementation=cache_implementation,
+                 max_length=max_length,
+                 cache_config={
+                     "batch_size": batch_size,
+                     "max_cache_len": max_length,
+                 },
+             ),
+         )
+         return eager_model
+
+     try:
+         eager_model = _load_eager_pretrained(
+             model_name_or_path,
+             device,
+             dtype,
+             config,
+             attn_implementation,
+             cache_implementation,
+             batch_size,
+             max_length,
+         )
+     except ValueError as e:
+         if "torch.nn.functional.scaled_dot_product_attention" in str(e):
+             logging.info("⚠ SDPA attention not supported, falling back to eager implementation")
+             attn_implementation = "eager"
+             eager_model = _load_eager_pretrained(
+                 model_name_or_path,
+                 device,
+                 dtype,
+                 config,
+                 attn_implementation,
+                 cache_implementation,
+                 batch_size,
+                 max_length,
+             )

      for param in eager_model.parameters():
          # Must disable gradient for quantized checkpoint
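The new `_load_eager_pretrained` wrapper exists so the load can be retried: when a model class has no SDPA support, `transformers` raises a `ValueError` whose message mentions `torch.nn.functional.scaled_dot_product_attention`, and the loader falls back to the eager attention implementation. A minimal standalone sketch of the same pattern (the model id is illustrative):

```python
from transformers import AutoModelForCausalLM


def load_with_sdpa_fallback(model_id: str):
    # Try the requested SDPA attention first; fall back to eager if unsupported.
    try:
        return AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa")
    except ValueError as e:
        if "torch.nn.functional.scaled_dot_product_attention" not in str(e):
            raise
        return AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager")


model = load_with_sdpa_fallback("Salesforce/codegen-350M-mono")
```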
tests/models/test_modeling_codegen.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import logging
import os
import unittest

import pytest
import torchao
from executorch.extension.pybindings.portable_lib import ExecuTorchModule
from packaging.version import parse
from transformers import AutoConfig, AutoTokenizer
from transformers.testing_utils import slow

from optimum.executorch import ExecuTorchModelForCausalLM

from ..utils import check_causal_lm_output_quality


os.environ["TOKENIZERS_PARALLELISM"] = "false"


class ExecuTorchModelIntegrationTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @slow
    @pytest.mark.run_slow
    @pytest.mark.skipif(
        parse(torchao.__version__) < parse("0.11.0"),
        reason="Quantization is only available on torchao >= 0.11.0.",
    )
    def test_codegen_text_generation_with_8da4w_8we(self):
        model_id = "Salesforce/codegen-350M-mono"
        prompt = "def hello_world():"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        config = AutoConfig.from_pretrained(model_id)
        config.bos_token_id = tokenizer.bos_token_id
        config.eos_token_id = tokenizer.eos_token_id
        model = ExecuTorchModelForCausalLM.from_pretrained(
            model_id,
            config=config,
            recipe="xnnpack",
            **{"qlinear": True, "qembedding": True},
        )
        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
        self.assertIsInstance(model.model, ExecuTorchModule)
        generated_text = model.text_generation(
            tokenizer=tokenizer,
            prompt=prompt,
            max_seq_len=64,
        )
        logging.info(f"\nGenerated text:\n\t{generated_text}")
        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

        # Free memory before loading eager for quality check
        del model
        del tokenizer
        gc.collect()

        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
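The `check_causal_lm_output_quality` helper comes from the shared test utilities and is not part of this diff. As a rough mental model only (an assumption, not the repo's actual implementation), such a check can be approximated by scoring the exported model's generation with the eager reference model and thresholding the resulting perplexity:

```python
# Illustrative sketch (assumption): a perplexity-style quality check similar in
# spirit to check_causal_lm_output_quality, not the repo's actual helper.
import torch
from transformers import AutoModelForCausalLM


def rough_output_quality(model_id: str, generated_tokens: torch.Tensor, max_perplexity: float = 100.0) -> bool:
    # Score the generated tokens with the eager reference model.
    reference = AutoModelForCausalLM.from_pretrained(model_id)
    reference.eval()
    with torch.no_grad():
        out = reference(generated_tokens, labels=generated_tokens)
    # Low perplexity under the reference model suggests coherent output.
    return torch.exp(out.loss).item() < max_perplexity
```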

tests/models/test_modeling_glm.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import logging
import os
import unittest

import pytest
import torchao
from executorch.extension.pybindings.portable_lib import ExecuTorchModule
from packaging.version import parse
from transformers import AutoTokenizer
from transformers.testing_utils import slow

from optimum.executorch import ExecuTorchModelForCausalLM

from ..utils import check_causal_lm_output_quality


os.environ["TOKENIZERS_PARALLELISM"] = "false"


class ExecuTorchModelIntegrationTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @slow
    @pytest.mark.run_slow
    @pytest.mark.skipif(
        parse(torchao.__version__) < parse("0.11.0"),
        reason="Quantization is only available on torchao >= 0.11.0.",
    )
    def test_glm_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
        model_id = "THUDM/glm-edge-1.5b-chat"
        prompt = "hello!"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = ExecuTorchModelForCausalLM.from_pretrained(
            model_id,
            recipe="xnnpack",
            attn_implementation="custom_sdpa",
            use_custom_kv_cache=True,
            **{"qlinear": True, "qembedding": True},
        )
        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
        self.assertIsInstance(model.model, ExecuTorchModule)
        generated_text = model.text_generation(
            tokenizer=tokenizer,
            prompt=prompt,
            max_seq_len=64,
        )
        logging.info(f"\nGenerated text:\n\t{generated_text}")
        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

        # Free memory before loading eager for quality check
        del model
        del tokenizer
        gc.collect()

        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))

tests/models/test_modeling_gpt2.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import logging
import os
import unittest

import pytest
import torchao
from executorch.extension.pybindings.portable_lib import ExecuTorchModule
from packaging.version import parse
from transformers import AutoTokenizer
from transformers.testing_utils import slow

from optimum.executorch import ExecuTorchModelForCausalLM

from ..utils import check_causal_lm_output_quality


os.environ["TOKENIZERS_PARALLELISM"] = "false"


class ExecuTorchModelIntegrationTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @slow
    @pytest.mark.run_slow
    @pytest.mark.skipif(
        parse(torchao.__version__) < parse("0.11.0"),
        reason="Quantization is only available on torchao >= 0.11.0.",
    )
    def test_gpt2sw3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
        model_id = "AI-Sweden-Models/gpt-sw3-126m"
        prompt = "Träd är fina för att"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = ExecuTorchModelForCausalLM.from_pretrained(
            model_id,
            recipe="xnnpack",
            attn_implementation="custom_sdpa",
            use_custom_kv_cache=True,
            **{"qlinear": True, "qembedding": True},
        )
        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
        self.assertIsInstance(model.model, ExecuTorchModule)
        generated_text = model.text_generation(
            tokenizer=tokenizer,
            prompt=prompt,
            max_seq_len=64,
        )
        logging.info(f"\nGenerated text:\n\t{generated_text}")
        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

        # Free memory before loading eager for quality check
        del model
        del tokenizer
        gc.collect()

        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))

tests/models/test_modeling_gptj.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import logging
import os
import unittest

import pytest
import torchao
from executorch.extension.pybindings.portable_lib import ExecuTorchModule
from packaging.version import parse
from transformers import AutoConfig, AutoTokenizer
from transformers.testing_utils import slow

from optimum.executorch import ExecuTorchModelForCausalLM

from ..utils import check_causal_lm_output_quality


os.environ["TOKENIZERS_PARALLELISM"] = "false"


class ExecuTorchModelIntegrationTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @slow
    @pytest.mark.run_slow
    @pytest.mark.skipif(
        parse(torchao.__version__) < parse("0.11.0"),
        reason="Quantization is only available on torchao >= 0.11.0.",
    )
    def test_gptj_text_generation_with_8da4w_8we(self):
        model_id = "Milos/slovak-gpt-j-405M"
        prompt = "Tradičné jedlo na Orave sú"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        config = AutoConfig.from_pretrained(model_id)
        config.bos_token_id = tokenizer.bos_token_id
        config.eos_token_id = tokenizer.eos_token_id
        model = ExecuTorchModelForCausalLM.from_pretrained(
            model_id,
            config=config,
            recipe="xnnpack",
            **{"qlinear": True, "qembedding": True},
        )
        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
        self.assertIsInstance(model.model, ExecuTorchModule)
        generated_text = model.text_generation(
            tokenizer=tokenizer,
            prompt=prompt,
            max_seq_len=64,
        )
        logging.info(f"\nGenerated text:\n\t{generated_text}")
        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

        # Free memory before loading eager for quality check
        del model
        del tokenizer
        gc.collect()

        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
