40 changes: 40 additions & 0 deletions notebooks/benchmark-transformer-quantization/bench/inputs.py
@@ -0,0 +1,40 @@
import torch
import numpy as np

class TransformerInputGenerator:
    """
    Generates deterministic, semantically valid inputs to prevent
    KV-cache crashes (e.g., random beam_idx issues).
    """
    def __init__(self, tokenizer, batch_size=1, seq_len=128):
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.seq_len = seq_len

    def get_inputs(self):
        """
        Returns a dictionary of inputs.
        CRITICAL: beam_idx is initialized to zeros for greedy search.
        Random beam_idx causes 'ScaledDotProductAttention' crashes.
        """
        # 1. Valid Token IDs (kept within the tokenizer's vocab range)
        vocab_size = self.tokenizer.vocab_size
        input_ids = torch.arange(0, self.seq_len).repeat(self.batch_size, 1) % vocab_size

        # 2. Valid Attention Mask (no padding for benchmarking)
        attention_mask = torch.ones((self.batch_size, self.seq_len), dtype=torch.long)

        # 3. Position IDs (strict sequential ordering)
        position_ids = torch.arange(0, self.seq_len, dtype=torch.long).unsqueeze(0).repeat(self.batch_size, 1)

        # 4. Beam Index
        # Must strictly contain valid indices in [0, num_beams - 1].
        # For standard benchmarking we assume greedy/single-beam decoding -> all zeros.
        beam_idx = torch.zeros((self.batch_size, self.seq_len), dtype=torch.int32)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "beam_idx": beam_idx,
        }
24 changes: 24 additions & 0 deletions notebooks/benchmark-transformer-quantization/bench/kv_cache.py
@@ -0,0 +1,24 @@
import gc
import torch

class KVCacheManager:
    """
    Helper to manage memory state between benchmark runs.
    """
    @staticmethod
    def clear():
        """
        Aggressively clears PyTorch and Python garbage to prevent
        OOM (Out of Memory) during model swapping.
        """
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    @staticmethod
    def validate_config(model_config):
        """
        Ensures the model config has use_cache=True.
        """
        if hasattr(model_config, "use_cache") and not model_config.use_cache:
            print("WARNING: Model 'use_cache' is False. Benchmarking generation without KV-cache is slow.")
23 changes: 23 additions & 0 deletions notebooks/benchmark-transformer-quantization/bench/metrics.py
@@ -0,0 +1,23 @@
import psutil
import os
from pathlib import Path

class MetricsCollector:
    def __init__(self):
        self.process = psutil.Process(os.getpid())
        self.baseline_rss = self.process.memory_info().rss

    def get_current_rss_mb(self):
        """Returns Resident Set Size (physical RAM) in MB."""
        return self.process.memory_info().rss / (1024 * 1024)

    def get_ram_growth_mb(self):
        """Returns how much RAM this process has consumed since init, in MB."""
        return self.get_current_rss_mb() - (self.baseline_rss / 1024 / 1024)

    @staticmethod
    def get_directory_size_mb(directory):
        """Calculates the on-disk footprint of the exported model, in MB."""
        root_directory = Path(directory)
        if not root_directory.exists():
            return 0
        return sum(f.stat().st_size for f in root_directory.glob('**/*') if f.is_file()) / (1024 * 1024)
65 changes: 65 additions & 0 deletions notebooks/benchmark-transformer-quantization/bench/model_loader.py
@@ -0,0 +1,65 @@
import time
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.intel import OVModelForCausalLM

class ModelFactory:
    @staticmethod
    def load_pytorch(model_id):
        print(f"Loading PyTorch Model: {model_id}...")
        start = time.perf_counter()

        # Load model and tokenizer
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        load_time = time.perf_counter() - start

        # Return 4 values to match notebook expectations (metric is None for PyTorch)
        return model, tokenizer, load_time, None

    @staticmethod
    def load_openvino(model_id, precision="int4", device="CPU", cache_dir="./ov_cache"):
        """
        Loads an OpenVINO model with device selection.
        """
        print(f" Loading OpenVINO Model: {model_id} on {device}...")
        start = time.perf_counter()

        # 1. Determine cache path vs direct download
        if precision == "already_quantized":
            # FAST PATH: load a pre-optimized model directly from the Hugging Face Hub
            model = OVModelForCausalLM.from_pretrained(
                model_id,
                device=device  # device comes from the notebook's widget selection
            )
            cache_path = Path(cache_dir) / "pre_quantized_download"
        else:
            # SLOW PATH: local export & weight compression
            cache_path = Path(cache_dir) / f"{model_id.replace('/', '_')}_{precision}"

            export_config = {"trust_remote_code": True}
            if precision == "int4":
                export_config["load_in_4bit"] = True
                export_config["quantization_config"] = {"bits": 4, "sym": True, "group_size": 128}

            if not cache_path.exists():
                print(" ↳ Exporting to IR (this may take time)...")
                model = OVModelForCausalLM.from_pretrained(
                    model_id,
                    export=True,
                    device=device,  # device comes from the notebook's widget selection
                    **export_config
                )
                model.save_pretrained(cache_path)
            else:
                model = OVModelForCausalLM.from_pretrained(
                    cache_path,
                    device=device  # device comes from the notebook's widget selection
                )

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        load_time = time.perf_counter() - start

        # Return 4 values (cache path serves as the disk-size metric here)
        return model, tokenizer, load_time, cache_path
31 changes: 31 additions & 0 deletions notebooks/benchmark-transformer-quantization/bench/quantization.py
@@ -0,0 +1,31 @@
class QuantizationConfigFactory:
    """
    Returns valid quantization configurations for OpenVINO/NNCF weight compression,
    as plain dicts passed through to optimum-intel's from_pretrained().
    """
    @staticmethod
    def get_int4_config():
        """
        Returns an NNCF weight-compression config for 4-bit.
        sym=True and group_size=128 are a standard accuracy/speed balance on CPU.
        """
        return {
            "load_in_4bit": True,
            "quantization_config": {
                "bits": 4,
                "sym": True,
                "group_size": 128,
                "ratio": 1.0,
            },
        }

    @staticmethod
    def get_int8_config():
        """
        Returns an NNCF weight-compression config for 8-bit.
        """
        return {
            "load_in_8bit": True,
        }
52 changes: 52 additions & 0 deletions notebooks/benchmark-transformer-quantization/bench/runner.py
@@ -0,0 +1,52 @@
import torch
import time
import numpy as np
from tqdm import tqdm
from .inputs import TransformerInputGenerator
from .metrics import MetricsCollector
from .kv_cache import KVCacheManager

class BenchmarkRunner:
    def __init__(self, model, tokenizer, framework="pt"):
        self.model = model
        self.generator = TransformerInputGenerator(tokenizer)
        self.framework = framework
        self.metrics = MetricsCollector()

        # Validate config on startup
        if hasattr(model, "config"):
            KVCacheManager.validate_config(model.config)

    def run(self, num_iters=50, warmup=5):
        # Clear memory before starting
        KVCacheManager.clear()

        inputs = self.generator.get_inputs()

        print(f" Starting Benchmark [{self.framework}]...")

        # 1. Warmup
        for _ in range(warmup):
            with torch.no_grad():
                self.model(**inputs)

        # 2. Measurement loop
        latencies = []
        start_global = time.perf_counter()

        # tqdm progress bar
        for _ in tqdm(range(num_iters), desc=f"{self.framework} Bench"):
            t0 = time.perf_counter()
            with torch.no_grad():
                self.model(**inputs)
            latencies.append((time.perf_counter() - t0) * 1000)  # ms

        duration = time.perf_counter() - start_global

        # 3. Aggregate metrics (the notebook expects exactly these keys)
        return {
            "p50_ms": np.median(latencies),
            "p99_ms": np.percentile(latencies, 99),
            "throughput_ips": num_iters / duration,
            "ram_usage_mb": self.metrics.get_current_rss_mb(),
        }
22 changes: 22 additions & 0 deletions notebooks/benchmark-transformer-quantization/bench/utils.py
@@ -0,0 +1,22 @@
import sys
import platform
import torch
import openvino as ov

def check_environment():
    print(f"System: {platform.system()} {platform.release()}")
    print(f"Python: {sys.version.split()[0]}")
    print(f"OpenVINO: {ov.get_version()}")
    print(f"Torch: {torch.__version__}")

    # List the inference devices OpenVINO can see on this machine (CPU/GPU/NPU)
    available_devices = ov.Core().available_devices
    print(f" Available OpenVINO devices: {available_devices}")

    if "NPU" in available_devices:
        print(" NPU detected (Intel Core Ultra / Meteor Lake)")
    elif "GPU" in available_devices:
        print(" iGPU detected")
    else:
        print(" CPU only (performance will be baseline)")

benchmark-transformer-notebook.ipynb: Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions notebooks/benchmark-transformer-quantization/configs/benchmark_config.yaml
@@ -0,0 +1,16 @@
experiment_name: "Qwen2.5_Benchmark_CPUBaseline"

model:
  id: "Qwen/Qwen2.5-0.5B-Instruct"  # change this to any Hugging Face model id
  task: "text-generation"

benchmark:
  warmup_iterations: 3
  measure_iterations: 15
  batch_size: 1
  sequence_length: 128

export:
  cache_dir: "./ov_cache"
  precision: "int4"  # Options: int4, int8, fp32
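
The notebook cell that consumes this file is not part of the rendered diff; a minimal sketch of how the config could be loaded (assuming PyYAML and the keys shown above):

```python
import yaml  # PyYAML

with open("configs/benchmark_config.yaml") as fh:
    cfg = yaml.safe_load(fh)

model_id = cfg["model"]["id"]
warmup = cfg["benchmark"]["warmup_iterations"]
num_iters = cfg["benchmark"]["measure_iterations"]
precision = cfg["export"]["precision"]
print(f"Benchmarking {model_id} at {precision} ({warmup} warmup / {num_iters} measured iterations)")
```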

47 changes: 47 additions & 0 deletions notebooks/benchmark-transformer-quantization/readme.md
@@ -0,0 +1,47 @@
# ⚡ Transformer Quantization Benchmark: PyTorch vs. OpenVINO

![Python](https://img.shields.io/badge/Python-3.8%2B-blue)
![OpenVINO](https://img.shields.io/badge/OpenVINO-2025.0-purple)
![PyTorch](https://img.shields.io/badge/PyTorch-2.0%2B-red)
![License](https://img.shields.io/badge/License-Apache_2.0-green)

A modular benchmarking framework designed to measure the **inference performance gap** between standard PyTorch (FP32) and optimized OpenVINO (INT4) runtimes for Large Language Models (LLMs) on CPU.

> **Key Result:** Achieved a **~10.39x Speedup** and **3.1x Storage Reduction** on Qwen2.5-0.5B-Instruct.

## 🚀 Overview

Running LLMs on consumer hardware (laptops, edge devices) is challenging due to high latency and memory usage. This project demonstrates how **NNCF (Neural Network Compression Framework)** and **INT4 Quantization** can unlock real-time performance on standard CPUs.
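
For orientation, the INT4 export path in `bench/model_loader.py` boils down to roughly the following (a minimal sketch; exact arguments depend on your `optimum-intel` version):

```python
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"

# Export to OpenVINO IR with 4-bit weight compression (NNCF runs under the hood)
model = OVModelForCausalLM.from_pretrained(
    model_id,
    export=True,
    load_in_4bit=True,
    quantization_config={"bits": 4, "sym": True, "group_size": 128},
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.save_pretrained("./ov_cache/Qwen_Qwen2.5-0.5B-Instruct_int4")
```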

**This benchmark measures:**
* **Latency (P50):** Median time to generate a token (Chat responsiveness).
* **Throughput:** Total tokens generated per second.
* **Memory Footprint:** RAM usage during generation.
* **Disk Size:** On-disk footprint of the exported model (compression ratio), measured by summing the actual file sizes (see the sketch below).
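
A condensed view of how `bench/runner.py` and `bench/metrics.py` reduce raw measurements into these numbers (illustrative sketch, not the exact notebook code):

```python
import os
from pathlib import Path

import numpy as np
import psutil

def summarize(latencies_ms, duration_s, model_dir):
    """Collapse per-iteration timings and system stats into report metrics."""
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
    disk_mb = sum(
        f.stat().st_size for f in Path(model_dir).glob("**/*") if f.is_file()
    ) / (1024 * 1024)
    return {
        "p50_ms": float(np.median(latencies_ms)),          # median latency
        "p99_ms": float(np.percentile(latencies_ms, 99)),  # tail latency
        "throughput_ips": len(latencies_ms) / duration_s,  # iterations per second
        "ram_usage_mb": rss_mb,
        "disk_size_mb": disk_mb,
    }
```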

## ✨ Key Features
* **Self-Contained Setup:** No manual terminal commands required. The notebook detects and installs the necessary dependencies (`openvino`, `torch`, `optimum-intel`) automatically (a conceptual sketch follows this list).
* **Interactive Controls:** Built-in widgets let you pick the target **Device** (CPU/GPU) and **Quantization Precision** (INT4, INT8, FP16) at runtime.
* **Robust Metrics:** Aggressive garbage collection between runs and on-disk size measurement (summing actual file sizes) keep comparisons consistent across hardware.
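
The bootstrap cell itself lives in the notebook (not shown in this diff); conceptually it does something like the following hypothetical sketch (package names and extras are assumptions):

```python
import importlib
import subprocess
import sys

def ensure(module_name, pip_name):
    """Install pip_name only if module_name cannot be imported."""
    try:
        importlib.import_module(module_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])

ensure("torch", "torch")
ensure("openvino", "openvino")
ensure("optimum.intel", "optimum[openvino]")  # assumed extra that provides optimum-intel
```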

## 📊 Benchmark Results (Sample)

![Benchmark results: PyTorch FP32 vs. OpenVINO INT4 on Qwen2.5-0.5B-Instruct](image.png)

## 📂 Project Structure

```text
benchmark-transformer-quantization/
├── bench/
│   ├── inputs.py              # Deterministic input generation (input_ids, attention_mask, beam_idx)
│   ├── kv_cache.py            # Memory management & garbage collection
│   ├── metrics.py             # System-level monitoring (RAM/disk)
│   ├── model_loader.py        # Handles FP32 loading & INT4 export
│   ├── quantization.py        # NNCF weight-compression configs
│   ├── runner.py              # Warmup & measurement loop
│   └── utils.py               # Environment & device checks
├── configs/
│   └── benchmark_config.yaml  # Easy-to-tune parameters
├── benchmark-transformer-notebook.ipynb  # 📖 Main interactive notebook
├── image.png                  # Result visualization
└── README.md                  # Project overview
```
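
How the pieces fit together, as a minimal sketch mirroring the modules above (the notebook adds widgets, plotting, and error handling):

```python
from bench.model_loader import ModelFactory
from bench.runner import BenchmarkRunner
from bench.metrics import MetricsCollector

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# PyTorch FP32 baseline
pt_model, tokenizer, pt_load_s, _ = ModelFactory.load_pytorch(MODEL_ID)
pt_results = BenchmarkRunner(pt_model, tokenizer, framework="pt").run(num_iters=15, warmup=3)

# OpenVINO INT4 (exported and cached on first run)
ov_model, tokenizer, ov_load_s, cache_path = ModelFactory.load_openvino(
    MODEL_ID, precision="int4", device="CPU"
)
ov_results = BenchmarkRunner(ov_model, tokenizer, framework="ov").run(num_iters=15, warmup=3)

print("Speedup (p50):", pt_results["p50_ms"] / ov_results["p50_ms"])
print("INT4 disk size (MB):", MetricsCollector.get_directory_size_mb(cache_path))
```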