
Commit 4e70504

FP16 Calibration Script Upgrade (#1602)
* Added improvements to the fp16 calibration script * Added default value for half type * tqdm tweak * Removed unneseccary dels * Added note regarding fp16 weights * Updated deepfloyd notebook calibration * deepfloyd tweak * Added calibration to llm-chatbot * Added calibration to tiny-sd-unet * Updated OV version for tiny-sd notebook * Improved chat-llm calibration logic * Fix the case when SNR is near-equal for many nodes * Revert calibration code for red pajama; add inference precision hint instead * Disable calibration verbosity for tiny-sd * Tweaks * Fix rag-chatbot * Tweak spelling * Update batch size for deepfloyd * Add ov uninstall command * Removed commented ops * Removed pickled UNet input. Made it downloadable.
1 parent ad04ef6 commit 4e70504
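
The calibration utility these notebooks call (partially_upcast_nodes_to_fp32 from notebooks/utils/model_upcast_utils.py) selectively keeps the most accuracy-sensitive operations in FP32; judging by the calls in the diffs below, it takes an example input plus upcast_ratio, operation_types, and batch_size arguments. The snippet below is only an illustrative sketch of the SNR-ranking selection idea the commit message hints at ("Fix the case when SNR is near-equal for many nodes"); the helper name and the name-based tie-break are assumptions, not the actual script.

# Illustrative sketch only -- not the actual model_upcast_utils implementation.
# Rank nodes by the signal-to-noise ratio of their FP16 output vs. the FP32
# reference and upcast the lowest-SNR fraction, breaking ties deterministically
# by name so near-equal SNR values do not make the selection order-dependent.
from typing import Dict, List


def select_nodes_to_upcast(snr_by_node: Dict[str, float], upcast_ratio: float) -> List[str]:
    ranked = sorted(snr_by_node.items(), key=lambda item: (item[1], item[0]))
    num_to_upcast = int(len(ranked) * upcast_ratio)
    return [name for name, _ in ranked[:num_to_upcast]]


# Example: with four MatMul nodes and upcast_ratio=0.5, the two worst are selected.
snrs = {"MatMul_12": 3.1, "MatMul_7": 48.0, "MatMul_3": 3.1, "MatMul_9": 120.5}
print(select_nodes_to_upcast(snrs, upcast_ratio=0.5))  # ['MatMul_12', 'MatMul_3']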


5 files changed (+339, -158 lines)


notebooks/238-deepfloyd-if/238-deep-floyd-if-convert.ipynb

Lines changed: 7 additions & 5 deletions
@@ -116,11 +116,12 @@
 }
 ],
 "source": [
-"# # Set up requirements\n",
+"# Set up requirements\n",
 "\n",
+"%pip uninstall -q -y openvino-dev openvino openvino-nightly\n",
 "%pip install -q --upgrade pip\n",
 "%pip install -q transformers \"diffusers>=0.16.1\" accelerate safetensors sentencepiece huggingface_hub --extra-index-url https://download.pytorch.org/whl/cpu\n",
-"%pip install -q \"openvino>=2023.2.0\" opencv-python\n",
+"%pip install -q \"openvino-nightly\" opencv-python\n",
 "%pip install -q gradio"
 ]
 },
@@ -151,7 +152,7 @@
 "checkpoint_variant = 'fp16'\n",
 "model_dtype = torch.float32\n",
 "ir_input_type = ov.Type.f32\n",
-"compress_to_fp16 = False\n",
+"compress_to_fp16 = True\n",
 "\n",
 "models_dir = Path('./models')\n",
 "models_dir.mkdir(exist_ok=True)\n",
@@ -872,9 +873,10 @@
 "if 'GPU' in core.available_devices and not is_model_partially_upcasted(encoder_ov_model):\n",
 "    example_input_prompt = 'ultra close color photo portrait of rainbow owl with deer horns in the woods'\n",
 "    text_inputs = stage_1.tokenizer(example_input_prompt, max_length=77, padding=\"max_length\", return_tensors=\"np\")\n",
-"    upcasted_ov_model = partially_upcast_nodes_to_fp32(encoder_ov_model, text_inputs.input_ids)\n",
+"    upcasted_ov_model = partially_upcast_nodes_to_fp32(encoder_ov_model, text_inputs.input_ids, upcast_ratio=0.05,\n",
+"                                                       operation_types=[\"MatMul\"], batch_size=10)\n",
 "    del encoder_ov_model\n",
-"    gc.collect();\n",
+"    gc.collect()\n",
 "\n",
 "    import os\n",
 "    os.remove(encoder_ir_path)\n",
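
For reference, the same upcasting flow as a standalone sketch (the IR path and the dummy token ids are placeholders; the upcast_ratio, operation_types, and batch_size values are the ones introduced above):

# Standalone sketch of the text-encoder upcasting flow applied in this hunk.
# "text_encoder.xml" is a placeholder IR path; the dummy token ids stand in for
# the tokenized example prompt the notebook builds with stage_1.tokenizer.
import numpy as np
import openvino as ov
from model_upcast_utils import is_model_partially_upcasted, partially_upcast_nodes_to_fp32

core = ov.Core()
encoder_ov_model = core.read_model("text_encoder.xml")
if not is_model_partially_upcasted(encoder_ov_model):
    example_input_ids = np.ones((1, 77), dtype=np.int64)  # max_length=77, as in the notebook
    upcasted_ov_model = partially_upcast_nodes_to_fp32(
        encoder_ov_model, example_input_ids,
        upcast_ratio=0.05, operation_types=["MatMul"], batch_size=10,
    )
    ov.save_model(upcasted_ov_model, "text_encoder.xml")  # overwrite the IR, as the notebook does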

notebooks/251-tiny-sd-image-generation/251-tiny-sd-image-generation.ipynb

Lines changed: 50 additions & 2 deletions
@@ -54,7 +54,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu torch torchvision \"openvino>=2023.1.0\" \"diffusers>=0.18.0\" \"transformers>=4.30.2\" \"gradio\" "
+"%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu torch torchvision \"openvino-nightly\" \"diffusers>=0.18.0\" \"transformers>=4.30.2\" \"gradio\""
 ]
 },
 {
@@ -907,13 +907,61 @@
 "text_enc = core.compile_model(TEXT_ENCODER_OV_PATH, device.value)"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {
+"collapsed": false
+},
+"source": [
+"### Calibrate UNet for GPU inference\n",
+"\n",
+"On a GPU device a model is executed in FP16 precision. For the Tiny-SD UNet model there are known accuracy issues caused by this. Therefore, a special calibration procedure is used to selectively mark some operations to be executed in full precision."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import pickle\n",
+"import urllib.request\n",
+"\n",
+"# Fetch `model_upcast_utils` which helps to restore accuracy when inferred on GPU\n",
+"urllib.request.urlretrieve(\n",
+"    url='https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/main/notebooks/utils/model_upcast_utils.py',\n",
+"    filename='model_upcast_utils.py'\n",
+")\n",
+"\n",
+"# Fetch an example input for UNet model needed for upcasting calibration process\n",
+"urllib.request.urlretrieve(\n",
+"    url='https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/pkl/unet_calibration_example_input.pkl',\n",
+"    filename='unet_calibration_example_input.pkl'\n",
+")\n",
+"from model_upcast_utils import is_model_partially_upcasted, partially_upcast_nodes_to_fp32\n",
+"\n",
+"unet_model = core.read_model(UNET_OV_PATH)\n",
+"if 'GPU' in core.available_devices and not is_model_partially_upcasted(unet_model):\n",
+"    with open(\"unet_calibration_example_input.pkl\", \"rb\") as f:\n",
+"        example_input = pickle.load(f)\n",
+"    unet_model = partially_upcast_nodes_to_fp32(unet_model, example_input, upcast_ratio=0.7,\n",
+"                                                operation_types=[\"Convolution\"])\n",
+"\n",
+"    import os\n",
+"    os.remove(UNET_OV_PATH)\n",
+"    os.remove(str(UNET_OV_PATH).replace(\".xml\", \".bin\"))\n",
+"    ov.save_model(unet_model, UNET_OV_PATH)"
+]
+},
 {
 "cell_type": "code",
 "execution_count": 10,
 "metadata": {},
 "outputs": [],
 "source": [
-"unet_model = core.compile_model(UNET_OV_PATH, device.value)"
+"unet_model = core.compile_model(unet_model, device.value)"
 ]
 },
 {
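
A quick follow-up check for the new cell above, reusing only names it already defines (core, UNET_OV_PATH, is_model_partially_upcasted): after ov.save_model the re-read IR should report as already upcasted, which keeps the calibration cell a no-op on later runs.

# Sanity check: the saved UNet IR now carries the partial-upcast marking,
# so re-running the calibration cell skips the (slow) upcasting pass.
assert is_model_partially_upcasted(core.read_model(UNET_OV_PATH))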

notebooks/254-llm-chatbot/254-llm-chatbot.ipynb

Lines changed: 7 additions & 2 deletions
@@ -736,10 +736,15 @@
 "    model_dir = fp16_model_dir\n",
 "print(f\"Loading model from {model_dir}\")\n",
 "\n",
-"model_name = model_configuration[\"model_id\"]\n",
-"class_key = model_id.value.split(\"-\")[0]\n",
 "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\", \"NUM_STREAMS\": \"1\", \"CACHE_DIR\": \"\"}\n",
 "\n",
+"# On a GPU device a model is executed in FP16 precision. For the red-pajama-3b-chat model there are known accuracy\n",
+"# issues caused by this, which we avoid by setting the inference precision hint to \"f32\".\n",
+"if model_id.value == \"red-pajama-3b-chat\" and \"GPU\" in core.available_devices and device.value in [\"GPU\", \"AUTO\"]:\n",
+"    ov_config[\"INFERENCE_PRECISION_HINT\"] = \"f32\"\n",
+"\n",
+"model_name = model_configuration[\"model_id\"]\n",
+"class_key = model_id.value.split(\"-\")[0]\n",
 "tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
 "\n",
 "model_class = (\n",

notebooks/254-llm-chatbot/254-rag-chatbot.ipynb

Lines changed: 7 additions & 2 deletions
@@ -939,11 +939,16 @@
 "    model_dir = fp16_model_dir\n",
 "print(f\"Loading model from {model_dir}\")\n",
 "\n",
+"ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\", \"NUM_STREAMS\": \"1\", \"CACHE_DIR\": \"\"}\n",
+"\n",
+"# On a GPU device a model is executed in FP16 precision. For the red-pajama-3b-chat model there are known accuracy\n",
+"# issues caused by this, which we avoid by setting the inference precision hint to \"f32\".\n",
+"if llm_model_id.value == \"red-pajama-3b-chat\" and \"GPU\" in core.available_devices and llm_device.value in [\"GPU\", \"AUTO\"]:\n",
+"    ov_config[\"INFERENCE_PRECISION_HINT\"] = \"f32\"\n",
+"\n",
 "model_name = llm_model_configuration[\"model_id\"]\n",
 "stop_tokens = llm_model_configuration.get(\"stop_tokens\")\n",
 "class_key = llm_model_id.value.split(\"-\")[0]\n",
-"ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\", \"NUM_STREAMS\": \"1\", \"CACHE_DIR\": \"\"}\n",
-"\n",
 "tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
 "\n",
 "class StopOnTokens(StoppingCriteria):\n",
