From 894d859ac0a32963335c76f05dd7e869d87ad4e2 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Mon, 20 Jan 2025 19:23:14 +0400
Subject: [PATCH] distil whisper fix quantization (#2674)

---
 notebooks/distil-whisper-asr/distil-whisper-asr.ipynb      | 4 ++--
 notebooks/outetts-text-to-speech/ov_outetts_helper.py      | 6 +++++-
 notebooks/phi-3-vision/phi-3-vision.ipynb                  | 2 ++
 .../sparsity-optimization/sparsity-optimization.ipynb      | 4 +++-
 notebooks/stable-audio/stable-audio.ipynb                  | 2 +-
 .../stable-diffusion-v3/stable-diffusion-v3-torch-fx.ipynb | 3 ++-
 notebooks/whisper-asr-genai/whisper-asr-genai.ipynb        | 7 ++++++-
 7 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/notebooks/distil-whisper-asr/distil-whisper-asr.ipynb b/notebooks/distil-whisper-asr/distil-whisper-asr.ipynb
index fe07fd701dc..e055bb81486 100644
--- a/notebooks/distil-whisper-asr/distil-whisper-asr.ipynb
+++ b/notebooks/distil-whisper-asr/distil-whisper-asr.ipynb
@@ -1015,7 +1015,7 @@
     "### Quantize Distil-Whisper encoder and decoder models\n",
     "[back to top ⬆️](#Table-of-contents:)\n",
     "\n",
-    "Below we run the `quantize` function which calls `nncf.quantize` on Distil-Whisper encoder and decoder-with-past models. We don't quantize first-step-decoder because its share in whole inference time is negligible."
+    "Below we run the `quantize` function, which calls `nncf.quantize` on the Distil-Whisper encoder and decoder models. We don't quantize the first-step decoder because its share of the whole inference time is negligible."
    ]
   },
   {
@@ -1154,7 +1154,7 @@
     "        # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n",
     "        advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95)\n",
     "    )\n",
-    "    ov.save_model(quantized_decoder_with_past, quantized_model_path / \"openvino_decoder_model.xml\")\n",
+    "    ov.save_model(quantized_decoder, quantized_model_path / \"openvino_decoder_model.xml\")\n",
     "    del quantized_decoder\n",
     "    del decoder_calibration_data\n",
     "    gc.collect()\n",
diff --git a/notebooks/outetts-text-to-speech/ov_outetts_helper.py b/notebooks/outetts-text-to-speech/ov_outetts_helper.py
index e2c8af93261..e451020ce09 100644
--- a/notebooks/outetts-text-to-speech/ov_outetts_helper.py
+++ b/notebooks/outetts-text-to-speech/ov_outetts_helper.py
@@ -5,7 +5,11 @@
 try:
     from outetts.version.v1.interface import InterfaceHF
     from outetts.version.v1.prompt_processor import PromptProcessor
-    from outetts.version.v1.model import HFModel
+
+    try:
+        from outetts.version.v1.model import HFModel
+    except ImportError:
+        from outetts.models.hf_model import HFModel
     from outetts.wav_tokenizer.audio_codec import AudioCodec
 
     updated_version = True
diff --git a/notebooks/phi-3-vision/phi-3-vision.ipynb b/notebooks/phi-3-vision/phi-3-vision.ipynb
index 97e53331a54..82a27fc0396 100644
--- a/notebooks/phi-3-vision/phi-3-vision.ipynb
+++ b/notebooks/phi-3-vision/phi-3-vision.ipynb
@@ -51,6 +51,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import platform\n",
+    "\n",
     "%pip install -q -U \"torch>=2.1\" \"torchvision\" \"transformers>=4.45\" \"protobuf>=3.20\" \"gradio>=4.26\" \"Pillow\" \"accelerate\" \"tqdm\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
     "%pip install --pre -qU \"openvino>=2024.6.0\" \"openvino-tokenizers>=2024.6.0\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n",
     "%pip install -q -U \"nncf>=2.14.0\"\n",
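Note: the distil-whisper hunks above fix a stale variable name. `nncf.quantize` returns the quantized decoder into `quantized_decoder`, but the old save call still referenced `quantized_decoder_with_past`, so it either failed with a `NameError` or saved the wrong model. A minimal sketch of the intended flow, assuming the notebook's `decoder_model`, `decoder_calibration_data`, and `quantized_model_path` objects built earlier:

    import gc

    import nncf
    import openvino as ov

    quantized_decoder = nncf.quantize(
        decoder_model,
        calibration_dataset=nncf.Dataset(decoder_calibration_data),
        model_type=nncf.ModelType.TRANSFORMER,
        # Smooth Quant reduces activation quantization error; alpha found by grid search
        advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95),
    )
    # Save the object nncf.quantize actually returned, not a stale name
    ov.save_model(quantized_decoder, quantized_model_path / "openvino_decoder_model.xml")
    del quantized_decoder, decoder_calibration_data
    gc.collect()
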
diff --git a/notebooks/sparsity-optimization/sparsity-optimization.ipynb b/notebooks/sparsity-optimization/sparsity-optimization.ipynb
index f032c5d7e65..683ff2f4b01 100644
--- a/notebooks/sparsity-optimization/sparsity-optimization.ipynb
+++ b/notebooks/sparsity-optimization/sparsity-optimization.ipynb
@@ -124,6 +124,8 @@
    },
    "outputs": [],
    "source": [
+    "import torch\n",
+    "\n",
     "# The following model has been quantized, sparsified using Optimum-Intel 1.7 which is enabled by OpenVINO and NNCF\n",
     "# for reproducibility, refer https://huggingface.co/OpenVINO/bert-base-uncased-sst2-int8-unstructured80\n",
     "model_id = \"OpenVINO/bert-base-uncased-sst2-int8-unstructured80\"\n",
@@ -133,7 +135,7 @@
     "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
     "\n",
     "# Let's take the model for a spin!\n",
-    "sentiment_classifier = pipeline(\"text-classification\", model=ov_model, tokenizer=tokenizer)\n",
+    "sentiment_classifier = pipeline(\"text-classification\", model=ov_model, tokenizer=tokenizer, device=torch.device(\"cpu\"))\n",
     "\n",
     "text = \"He's a dreadful magician.\"\n",
     "outputs = sentiment_classifier(text)\n",
diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb
index b807823726c..e32435c5ac2 100644
--- a/notebooks/stable-audio/stable-audio.ipynb
+++ b/notebooks/stable-audio/stable-audio.ipynb
@@ -64,7 +64,7 @@
    "source": [
     "import platform\n",
     "\n",
-    "%pip install -q \"torch>=2.2\" torchaudio einops einops-exts huggingface-hub k-diffusion pytorch_lightning alias-free-torch ema-pytorch transformers>=4.45 \"gradio>=4.19 --extra-index-url https://download.pytorch.org/whl/cpu\n",
+    "%pip install -q \"torch>=2.2\" \"torchaudio\" \"einops\" \"einops-exts\" \"huggingface-hub\" \"k-diffusion\" \"pytorch_lightning\" \"alias-free-torch\" \"ema-pytorch\" \"transformers>=4.45\" \"gradio>=4.19\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
     "%pip install -q --no-deps \"stable-audio-tools\"\n",
     "%pip install -q \"nncf>=2.12.0\"\n",
     "if platform.system() == \"Darwin\":\n",
diff --git a/notebooks/stable-diffusion-v3/stable-diffusion-v3-torch-fx.ipynb b/notebooks/stable-diffusion-v3/stable-diffusion-v3-torch-fx.ipynb
index 4fed1daafed..0298e07882f 100644
--- a/notebooks/stable-diffusion-v3/stable-diffusion-v3-torch-fx.ipynb
+++ b/notebooks/stable-diffusion-v3/stable-diffusion-v3-torch-fx.ipynb
@@ -450,7 +450,8 @@
     "    ).shuffle(seed=42)\n",
     "\n",
     "    transformer_config = dict(pipe.transformer.config)\n",
-    "    del transformer_config[\"model\"]\n",
+    "    if \"model\" in transformer_config:\n",
+    "        del transformer_config[\"model\"]\n",
     "    wrapped_unet = UNetWrapper(pipe.transformer.model, transformer_config)\n",
     "    pipe.transformer = wrapped_unet\n",
     "    # Run inference for data collection\n",
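Note: the stable-diffusion-v3 hunk above guards the `del` so a config without a "model" entry no longer raises `KeyError`. `dict.pop` with a default is an equivalent, exception-free alternative; a self-contained sketch with a toy config, not the notebook's actual object:

    # Toy stand-in for the transformer config dict
    config = {"sample_size": 128, "model": "fx-wrapped module"}

    config.pop("model", None)  # removes the key when present
    config.pop("model", None)  # silently a no-op when absent, unlike `del`
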
\"openvino-genai>=2024.5.0\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n", "%pip install -q datasets \"gradio>=4.0\" \"soundfile>=0.12\" \"librosa\" \"python-ffmpeg<=1.0.16\"\n", "%pip install -q \"nncf>=2.14.0\" \"jiwer\" \"typing_extensions>=4.9\"\n", "if platform.system() == \"Darwin\":\n", + " %pip install -q \"numpy<2.0\"\n", + "\n", + "from transformers.utils.import_utils import is_tf_available\n", + "\n", + "if is_tf_available():\n", " %pip install -q \"numpy<2.0\"" ] },