Testing #3087 (Closed)

Changes from 14 commits

.github/workflows/linux.yml (8 changes: 7 additions & 1 deletion)

@@ -534,7 +534,7 @@ jobs:
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).tokenizers.test }}
   timeout: 60
 - name: 'API tests'
-  cmd: 'python -m pytest -v ./tests/python_tests/test_continuous_batching.py ./tests/python_tests/test_generation_config.py ./tests/python_tests/test_sampling.py ./tests/python_tests/test_text_streamer.py'
+  cmd: 'python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "not eagle3" ./tests/python_tests/test_generation_config.py ./tests/python_tests/test_sampling.py ./tests/python_tests/test_text_streamer.py'
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test || fromJSON(needs.smart_ci.outputs.affected_components).sampling.test || fromJSON(needs.smart_ci.outputs.affected_components).text_streamer.test }}
   timeout: 60
 - name: 'Rag tests'
@@ -551,6 +551,12 @@ jobs:
     python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
   timeout: 90
+- name: 'EAGLE3 speculative decoding tests'
+  cmd: |

  [Review comment: Copilot AI, Nov 30, 2025]
  Installing from a personal GitHub fork using a specific commit hash is fragile and not maintainable. Consider either: 1) merging these changes into the official optimum-intel repository and using a tagged release, or 2) documenting why this fork is necessary and when it can be removed.

  Suggested change:
  -  cmd: |
  +  cmd: |
  +    # FIXME: Installing from a personal fork is fragile. This is required because the official optimum-intel does not yet support EAGLE3 speculative decoding.
  +    # Remove this and use the official optimum-intel release once https://github.com/huggingface/optimum-intel/pull/XXX is merged and released.

+    python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@ea9607daf32919024cdd4390deec9693a7b64d23
+    python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "eagle3"
+  run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).speculative_decoding.test }}
+  timeout: 90
 defaults:
   run:
     shell: bash
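
The same -k name filter does the splitting in all three workflow files in this PR: the 'API tests' step now deselects every test whose name contains "eagle3", and the new step runs exactly that remainder after installing the patched optimum-intel. A minimal sketch of how pytest's -k substring matching performs this split, using hypothetical test names (the real ones live in test_continuous_batching.py):

# demo_k_filter.py: hedged illustration of pytest's -k substring filter.
# Run:
#   python -m pytest -v demo_k_filter.py -k "not eagle3"   # collects test_greedy_decoding only
#   python -m pytest -v demo_k_filter.py -k "eagle3"       # collects test_eagle3_speculative only

def test_greedy_decoding():
    # Name lacks the "eagle3" substring, so it stays in the 'API tests' step.
    assert True

def test_eagle3_speculative():
    # Name contains "eagle3", so only the dedicated EAGLE3 step selects it.
    assert True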

.github/workflows/manylinux_2_28.yml (8 changes: 7 additions & 1 deletion)

@@ -472,7 +472,7 @@ jobs:
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).tokenizers.test }}
   timeout: 60
 - name: 'API tests'
-  cmd: 'python -m pytest -v ./tests/python_tests/test_continuous_batching.py ./tests/python_tests/test_generation_config.py ./tests/python_tests/test_sampling.py ./tests/python_tests/test_text_streamer.py'
+  cmd: 'python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "not eagle3" ./tests/python_tests/test_generation_config.py ./tests/python_tests/test_sampling.py ./tests/python_tests/test_text_streamer.py'
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test || fromJSON(needs.smart_ci.outputs.affected_components).sampling.test || fromJSON(needs.smart_ci.outputs.affected_components).text_streamer.test }}
   timeout: 60
 - name: 'Rag tests'
@@ -489,6 +489,12 @@ jobs:
     python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
   timeout: 90
+- name: 'EAGLE3 speculative decoding tests'
+  cmd: |

  [Review comment: Copilot AI, Nov 30, 2025]
  Installing from a personal GitHub fork using a specific commit hash is fragile and not maintainable. Consider either: 1) merging these changes into the official optimum-intel repository and using a tagged release, or 2) documenting why this fork is necessary and when it can be removed.

  Suggested change:
  -  cmd: |
  +  cmd: |
  +    # FIXME: Using a personal fork of optimum-intel for EAGLE3 speculative decoding tests.
  +    # Reason: Required changes are not yet merged upstream. See https://github.com/huggingface/optimum-intel/pull/<PR_NUMBER> (replace with actual PR/issue link).
  +    # Remove this and use official optimum-intel release once changes are merged.

+    python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@ea9607daf32919024cdd4390deec9693a7b64d23
+    python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "eagle3"
+  run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).speculative_decoding.test }}
+  timeout: 90
 defaults:
   run:
     shell: bash

.github/workflows/windows.yml (8 changes: 7 additions & 1 deletion)

@@ -623,7 +623,7 @@ jobs:
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).tokenizers.test }}
   timeout: 60
 - name: 'API tests'
-  cmd: 'python -m pytest -s -v tests/python_tests/test_continuous_batching.py tests/python_tests/test_generation_config.py tests/python_tests/test_sampling.py tests/python_tests/test_text_streamer.py'
+  cmd: 'python -m pytest -s -v tests/python_tests/test_continuous_batching.py -k "not eagle3" tests/python_tests/test_generation_config.py tests/python_tests/test_sampling.py tests/python_tests/test_text_streamer.py'
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test || fromJSON(needs.smart_ci.outputs.affected_components).sampling.test || fromJSON(needs.smart_ci.outputs.affected_components).text_streamer.test }}
   timeout: 60
 - name: 'Rag tests'
@@ -640,6 +640,12 @@ jobs:
     python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
   run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
   timeout: 90
+- name: 'EAGLE3 speculative decoding tests'
+  cmd: |

  [Review comment: Copilot AI, Nov 30, 2025]
  Installing from a personal GitHub fork using a specific commit hash is fragile and not maintainable. Consider either: 1) merging these changes into the official optimum-intel repository and using a tagged release, or 2) documenting why this fork is necessary and when it can be removed.

  Suggested change:
  -  cmd: |
  +  cmd: |
  +    # TODO: Using a personal fork of optimum-intel for EAGLE3 speculative decoding tests.
  +    # Reason: [Add explanation here, e.g., "Required for feature X not yet merged upstream. See PR #123."]
  +    # Remove this and use official optimum-intel release when upstream PR is merged.

+    python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@ea9607daf32919024cdd4390deec9693a7b64d23
+    python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "eagle3"
+  run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).speculative_decoding.test }}
+  timeout: 90
 defaults:
   run:
     shell: pwsh

samples/python/visual_language_chat/benchmark_vlm.py (10 changes: 9 additions & 1 deletion)

@@ -35,6 +35,7 @@ def read_images(path: str) -> list[Tensor]:
 def main():
     parser = argparse.ArgumentParser(description="Help command")
     parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
+    parser.add_argument("-dm", "--draft_model", type=str, help="Path to draft model and tokenizers base directory")
     parser.add_argument("-p", "--prompt", type=str, default=None, help="Prompt")
     parser.add_argument("-pf", "--prompt_file", type=str, help="Read prompt from file")
     parser.add_argument("-i", "--image", type=str, default="image.jpg", help="Image")
@@ -61,6 +62,7 @@ def main():
     # Perf metrics is stored in VLMDecodedResults.
     # In order to get VLMDecodedResults instead of a string input should be a list.
     models_path = args.model
+    draft_model_path = args.draft_model
     images = read_images(args.image)
     device = args.device
     num_warmup = args.num_warmup
@@ -76,7 +78,13 @@ def main():
     scheduler_config = ov_genai.SchedulerConfig()
     scheduler_config.enable_prefix_caching = False
     scheduler_config.max_num_batched_tokens = sys.maxsize
-    pipe = ov_genai.VLMPipeline(models_path, device, scheduler_config=scheduler_config)
+
+    print("draft_model_path=", draft_model_path)
+    print("device=", device)
+    draft_model = ov_genai.draft_model(str(draft_model_path), device)
+    #pipe = ov_genai.VLMPipeline(models_path, device, scheduler_config=scheduler_config)

  [Review comment: Copilot AI, Nov 30, 2025, on lines +82 to +85]
  Debug print statements and commented-out code should be removed before merging. These appear to be temporary testing code that shouldn't be in production.

  Suggested change:
  -    print("draft_model_path=", draft_model_path)
  -    print("device=", device)
  -    draft_model = ov_genai.draft_model(str(draft_model_path), device)
  -    #pipe = ov_genai.VLMPipeline(models_path, device, scheduler_config=scheduler_config)
  +    draft_model = ov_genai.draft_model(str(draft_model_path), device)

+    pipe = ov_genai.VLMPipeline(models_path, device, scheduler_config=scheduler_config, draft_model=draft_model)
+

     input_data = pipe.get_tokenizer().encode(prompt)
     prompt_token_size = input_data.input_ids.get_shape()[1]
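
In the spirit of the review comment above, the draft model could also be made optional so the benchmark still runs without -dm. A hedged sketch, not part of the PR: build_pipeline is a hypothetical helper, while the VLMPipeline and draft_model names follow the diff above.

import openvino_genai as ov_genai

def build_pipeline(models_path, device, scheduler_config, draft_model_path=None):
    # Forward draft_model only when a draft model directory was actually
    # provided, so plain (non-speculative) benchmarking keeps working.
    kwargs = {"scheduler_config": scheduler_config}
    if draft_model_path is not None:
        kwargs["draft_model"] = ov_genai.draft_model(str(draft_model_path), device)
    return ov_genai.VLMPipeline(models_path, device, **kwargs)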

@@ -65,13 +65,18 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
     class ContinuousBatchingImpl;

     class ContinuousBatchingForSpeculativeDecodingImpl;
+    class ContinuousBatchingForEagle3DecodingImpl;
     class ContinuousBatchingForPromptLookupImpl;
     class SpeculativeDecodingImpl;
+    class Eagle3DecodingImpl;
     class PromptLookupImpl;

     friend class ContinuousBatchingForSpeculativeDecodingImpl;
+
     friend class ContinuousBatchingForPromptLookupImpl;
+    friend class ContinuousBatchingForEagle3DecodingImpl;
     friend class SpeculativeDecodingImpl;
+    friend class Eagle3DecodingImpl;
     friend class PromptLookupImpl;

     std::shared_ptr<IContinuousBatchingPipeline> m_impl;