From f0deb2a425685a306391c2b2237f1b19d0365f36 Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Thu, 6 Feb 2025 09:16:36 +0100 Subject: [PATCH] Add genai to sdxl-turbo (#2719) CVS-161647 --- notebooks/sdxl-turbo/sdxl-turbo.ipynb | 568 ++++++++++++-------------- 1 file changed, 255 insertions(+), 313 deletions(-) diff --git a/notebooks/sdxl-turbo/sdxl-turbo.ipynb b/notebooks/sdxl-turbo/sdxl-turbo.ipynb index 142dc8d20da..d2528b415dc 100644 --- a/notebooks/sdxl-turbo/sdxl-turbo.ipynb +++ b/notebooks/sdxl-turbo/sdxl-turbo.ipynb @@ -13,7 +13,7 @@ "\n", "Previously, we already discussed how to launch Stable Diffusion XL model using OpenVINO in the following [notebook](../stable-diffusion-xl), in this tutorial we will focus on the [SDXL-turbo](https://huggingface.co/stabilityai/sdxl-turbo) version. Additionally, to improve image decoding speed, we will use [Tiny Autoencoder](https://github.com/madebyollin/taesd), which is useful for real-time previewing of the SDXL generation process.\n", "\n", - "We will use a pre-trained model from the [Hugging Face Diffusers](https://huggingface.co/docs/diffusers/index) library. To simplify the user experience, the [Hugging Face Optimum Intel](https://huggingface.co/docs/optimum/intel/index) library is used to convert the models to OpenVINO™ IR format.\n", + "We will use a pre-trained model from the [Hugging Face Diffusers](https://huggingface.co/docs/diffusers/index) library. To simplify the user experience, the [Hugging Face Optimum Intel](https://huggingface.co/docs/optimum/intel/index) library is used to convert the models to OpenVINO™ IR format. For running the image generation we will use [OpenVINO GenAI](https://github.com/openvinotoolkit/openvino.genai) that provides easy-to-use API.\n", "\n", "#### Table of contents:\n", "\n", @@ -57,8 +57,8 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \\\n", - "\"torch>=2.1\" transformers \"diffusers>=0.24.0\" \"git+https://github.com/huggingface/optimum-intel.git\" \"gradio>=4.19\" \"peft>=0.6.2\" \"openvino>=2023.3.0\"" + "%pip install -q -U --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly \"openvino>=2024.5\" \"openvino-genai>=2024.5\"\n", + "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \"torch>=2.1\" transformers \"diffusers>=0.24.0\" \"git+https://github.com/huggingface/optimum-intel.git\" \"gradio>=4.19\" \"peft>=0.6.2\" \"nncf>=2.14.0\"" ] }, { @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "fb8d69a7-56ca-4acd-8aa9-9a9f31b58496", "metadata": {}, "outputs": [], @@ -103,12 +103,17 @@ "from pathlib import Path\n", "import requests\n", "\n", + "\n", "if not Path(\"notebook_utils.py\").exists():\n", " r = requests.get(\n", " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", " )\n", " open(\"notebook_utils.py\", \"w\").write(r.text)\n", "\n", + "if not Path(\"cmd_helper.py\").exists():\n", + " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py\")\n", + " open(\"cmd_helper.py\", \"w\").write(r.text)\n", + "\n", "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n", "from notebook_utils import collect_telemetry\n", "\n", @@ -124,13 +129,18 @@ "cell_type": "code", "execution_count": null, "id": 
"e19f90d9-55d1-4e99-91c0-9f72e0240cf2", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ + "import gc\n", + "\n", "import torch\n", - "import openvino as ov\n", "from diffusers import AutoencoderTiny\n", - "import gc\n", + "import openvino as ov\n", + "\n", + "from cmd_helper import optimum_cli\n", "\n", "\n", "class VAEEncoder(torch.nn.Module):\n", @@ -139,16 +149,7 @@ " self.vae = vae\n", "\n", " def forward(self, sample):\n", - " return self.vae.encode(sample)\n", - "\n", - "\n", - "class VAEDecoder(torch.nn.Module):\n", - " def __init__(self, vae):\n", - " super().__init__()\n", - " self.vae = vae\n", - "\n", - " def forward(self, latent_sample):\n", - " return self.vae.decode(latent_sample)\n", + " return {\"latent_sample\": self.vae.encode(x=sample)[\"latents\"]}\n", "\n", "\n", "def convert_tiny_vae(model_id, output_path):\n", @@ -158,14 +159,10 @@ " ov_model = ov.convert_model(vae_encoder, example_input=torch.zeros((1, 3, 512, 512)))\n", " ov.save_model(ov_model, output_path / \"vae_encoder/openvino_model.xml\")\n", " tiny_vae.save_config(output_path / \"vae_encoder\")\n", - " vae_decoder = VAEDecoder(tiny_vae)\n", - " ov_model = ov.convert_model(vae_decoder, example_input=torch.zeros((1, 4, 64, 64)))\n", - " ov.save_model(ov_model, output_path / \"vae_decoder/openvino_model.xml\")\n", - " tiny_vae.save_config(output_path / \"vae_decoder\")\n", "\n", "\n", "if not skip_convert_model:\n", - " !optimum-cli export openvino --model $sdxl_model_id --task stable-diffusion-xl $model_dir --weight-format fp16\n", + " optimum_cli(sdxl_model_id, model_dir, additional_args={\"weight-format\": \"fp16\"})\n", " convert_tiny_vae(tae_id, model_dir)" ] }, @@ -180,7 +177,8 @@ "\n", "\n", "Text-to-image generation lets you create images using text description. To start generating images, we need to load models first.\n", - "To load an OpenVINO model and run an inference with Optimum and OpenVINO Runtime, you need to replace diffusers `StableDiffusionXLPipeline` with Optimum `OVStableDiffusionXLPipeline`. Pipeline initialization starts with using `from_pretrained` method, where a directory with OpenVINO models should be passed. Additionally, you can specify an inference device." + "We will use [OpenVINO GenAI](https://github.com/openvinotoolkit/openvino.genai) that provides easy-to-use API for running text generation. Firstly we will create pipeline with `Text2ImagePipeline` and `Image2ImagePipeline`. You can see more details in [Image Python Generation Pipeline Example](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2025/0/samples/python/image_generation)\n", + "Then we just run `generate` method and get the image tokens and then convert them into the image using `Image.fromarray` from PIL." 
] }, { @@ -195,10 +193,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "2fe98f06-2183-446a-8e38-c475073ded26", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c000481839854c3899769220d1cb469f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from notebook_utils import device_widget\n", "\n", @@ -210,25 +224,14 @@ { "cell_type": "code", "execution_count": 5, - "id": "0bc47bd6-6571-4ff2-b111-b68af66777c3", + "id": "a42c27d4-b531-4451-8232-83de74f6a694", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the vae_decoder to AUTO ...\n", - "Compiling the unet to AUTO ...\n", - "Compiling the text_encoder to AUTO ...\n", - "Compiling the text_encoder_2 to AUTO ...\n", - "Compiling the vae_encoder to AUTO ...\n" - ] - } - ], + "outputs": [], "source": [ - "from optimum.intel.openvino import OVStableDiffusionXLPipeline\n", + "import openvino_genai as ov_genai\n", "\n", - "text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value)" + "\n", + "text2image_pipe = ov_genai.Text2ImagePipeline(model_dir, device.value)" ] }, { @@ -237,7 +240,7 @@ "id": "67a6df12-966d-49f6-8987-776b1d451e20", "metadata": {}, "source": [ - "The pipeline interface is similar to original `StableDiffusionXLPipeline`. We should provide text prompt. The default number of steps is 50, while sdxl-turbo required only 1 step. According to the information provided in model card, model does not use negative prompt and guidance scale and this parameters should be disabled using `guidance_scale = 0`" + "We should provide text prompt. The default number of steps is 50, while sdxl-turbo required only 1 step. According to the information provided in model card, model does not use negative prompt and guidance scale and this parameters should be disabled using `guidance_scale = 0`" ] }, { @@ -248,22 +251,8 @@ "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "27ac4feb1313482dbbefd9603ae915a9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00" ] @@ -275,16 +264,19 @@ ], "source": [ "import numpy as np\n", + "from PIL import Image\n", + "\n", "\n", "prompt = \"cute cat\"\n", - "image = text2image_pipe(\n", + "image_tensor = text2image_pipe.generate(\n", " prompt,\n", " num_inference_steps=1,\n", " height=512,\n", " width=512,\n", " guidance_scale=0.0,\n", - " generator=np.random.RandomState(987),\n", - ").images[0]\n", + " generator=ov_genai.TorchGenerator(987),\n", + ")\n", + "image = Image.fromarray(image_tensor.data[0])\n", "image.save(\"cat.png\")\n", "image" ] @@ -310,7 +302,7 @@ "[back to top ⬆️](#Table-of-contents:)\n", "\n", "\n", - "Image-to-image generation lets you transform images to match the characteristics provided in the text description. We can reuse the already converted model for running the Image2Image generation pipeline. For that, we should replace `OVStableDiffusionXLPipeline` with `OVStableDiffusionXLImage2ImagePipeline`." + "Image-to-image generation lets you transform images to match the characteristics provided in the text description. We can reuse the already converted model for running the Image2Image generation pipeline. 
For that, we should replace `OVStableDiffusionXLPipeline` with `Image2ImagePipeline`. We also convert the input image to `ov.Tensor` using the `image_to_tensor` function."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "id": "75071ab3-ffc3-4de2-8edd-c8bcbd50f5b4",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Compiling the vae_decoder to AUTO ...\n",
-      "Compiling the unet to AUTO ...\n",
-      "Compiling the vae_encoder to AUTO ...\n",
-      "Compiling the text_encoder_2 to AUTO ...\n",
-      "Compiling the text_encoder to AUTO ...\n"
-     ]
-    }
-   ],
-   "source": [
-    "from optimum.intel import OVStableDiffusionXLImg2ImgPipeline\n",
-    "\n",
-    "image2image_pipe = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_dir, device=device.value)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "4b9269e1-5bd4-4d26-8ee8-0df35c4e53bc",
-   "metadata": {},
    "outputs": [],
    "source": [
-    "photo_prompt = \"a cute cat with bow tie\""
+    "image2image_pipe = ov_genai.Image2ImagePipeline(model_dir, device=device.value)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "d2bd1605-82fd-4be7-9384-db69641dcf0b",
    "metadata": {},
    "source": [
-    "`strength` parameter is important for the image-to-image generation pipeline. It is a value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. Values that approach 1.0 enable lots of variations but will also produce images that are not semantically consistent with the input, then close to 0, less noise will be added and the target image will preserve source image content. strength has an impact not only on a number of noise but also the number of generation steps. The number of denoising iterations in the image-to-image generation pipeline is calculated as `int(num_inference_steps * strength)`. With sdxl-turbo we should be careful with selecting `num_inference_steps` and `strength` to produce the correct result and make sure that the number of steps used in pipeline >= 1 after applying strength multiplication. e.g. in example below, we will use `num_inference_steps=2` and `stength=0.5`, finally, we get 0.5 * 2.0 = 1 step in our pipeline."
+    "The `strength` parameter is important for the image-to-image generation pipeline. It is a value between 0.0 and 1.0 that controls the amount of noise added to the input image. Values approaching 1.0 allow lots of variation but also produce images that are not semantically consistent with the input, while values close to 0 add less noise and preserve more of the source image content. `strength` affects not only the amount of added noise but also the number of generation steps: the number of denoising iterations in the image-to-image pipeline is calculated as `int(num_inference_steps * strength)`. With sdxl-turbo we should choose `num_inference_steps` and `strength` carefully and make sure that the resulting number of steps is >= 1 after the strength multiplication. For example, below we use `num_inference_steps=3` and `strength=0.7`, which gives `int(3 * 0.7) = 2` steps in our pipeline.\n",
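+    "\n",
+    "As a quick sanity check of that arithmetic (a small illustrative snippet, not part of the pipeline code itself):\n",
+    "\n",
+    "```python\n",
+    "num_inference_steps, strength = 3, 0.7\n",
+    "effective_steps = int(num_inference_steps * strength)  # int(2.1) == 2 denoising iterations\n",
+    "assert effective_steps >= 1, \"increase num_inference_steps or strength for sdxl-turbo\"\n",
+    "```"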
] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "9eace4a5-cdd1-44d2-aced-f21a944802eb", "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8cc96faf51d8404faeedcedd55c7696e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "photo_image = image2image_pipe(\n", + "def image_to_tensor(image: Image) -> ov.Tensor:\n", + " pic = image.convert(\"RGB\")\n", + " image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)\n", + " return ov.Tensor(image_data)\n", + "\n", + "\n", + "init_image = image_to_tensor(image)\n", + "photo_prompt = \"a cute cat with bow tie\"\n", + "\n", + "photo_image_tensor = image2image_pipe.generate(\n", " photo_prompt,\n", - " image=image,\n", - " num_inference_steps=2,\n", - " generator=np.random.RandomState(511),\n", + " image=init_image,\n", + " num_inference_steps=3,\n", + " generator=ov_genai.TorchGenerator(60),\n", " guidance_scale=0.0,\n", - " strength=0.5,\n", - ").images[0]\n", + " strength=0.7,\n", + ")\n", + "photo_image = Image.fromarray(photo_image_tensor.data[0])\n", "photo_image.save(\"cat_tie.png\")\n", "photo_image" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "4d090210-f663-4a37-8819-f2f2b5c2534b", "metadata": {}, "outputs": [], @@ -438,14 +402,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "b29be9c3", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "59098071a8a14274a4d86b962fd07faf", + "model_id": "890458d1703145d29d134d74837e504d", "version_major": 2, "version_minor": 0 }, @@ -453,7 +417,7 @@ "Checkbox(value=True, description='Quantization')" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -468,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "e6fd26e3", "metadata": {}, "outputs": [], @@ -489,6 +453,36 @@ "%load_ext skip_kernel_extension" ] }, + { + "cell_type": "markdown", + "id": "fc9c2be2-8209-4dc7-8d64-ddd229ceab51", + "metadata": {}, + "source": [ + "We will create another one copy of the models and then replace `unet` model by quantized." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64dc63fa-808b-48f1-a82d-040644a4c88f", + "metadata": {}, + "outputs": [], + "source": [ + "%%skip not $to_quantize.value\n", + "\n", + "import os\n", + "import shutil\n", + "\n", + "\n", + "model_dir_int8 = Path(\"./model_int8\")\n", + "UNET_INT8_OV_PATH = model_dir_int8 / \"unet\" / \"openvino_model.xml\"\n", + "\n", + "\n", + "if not model_dir_int8.exists():\n", + " shutil.copytree(model_dir, model_dir_int8)\n", + " os.remove(UNET_INT8_OV_PATH) # remove to replace by optimised" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -505,12 +499,12 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "5b82d439", "metadata": {}, "outputs": [], "source": [ - "UNET_INT8_OV_PATH = model_dir / \"optimized_unet\" / \"openvino_model.xml\"\n", + "%%skip not $to_quantize.value\n", "\n", "\n", "def disable_progress_bar(pipeline, disable=True):\n", @@ -522,7 +516,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "22471a37", "metadata": {}, "outputs": [], @@ -583,29 +577,19 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "6b62f498", "metadata": { + "scrolled": true, "test_replace": { "subset_size=200": "subset_size=10" } }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the vae_decoder to AUTO ...\n", - "Compiling the unet to AUTO ...\n", - "Compiling the text_encoder_2 to AUTO ...\n", - "Compiling the vae_encoder to AUTO ...\n", - "Compiling the text_encoder to AUTO ...\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9d546b85c6e64ec7b08d52c8b3ba6434", + "model_id": "8be41d12aefa46c3985128daaf3b75c9", "version_major": 2, "version_minor": 0 }, @@ -620,6 +604,9 @@ "source": [ "%%skip not $to_quantize.value\n", "\n", + "from optimum.intel.openvino import OVStableDiffusionXLPipeline\n", + "\n", + "\n", "if not UNET_INT8_OV_PATH.exists():\n", " text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value)\n", " unet_calibration_data = collect_calibration_data(text2image_pipe, subset_size=200)" @@ -642,14 +629,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "b112e91c", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "057ac55fc9cb4be789a42c97289010b2", + "model_id": "ae5c0353549648788157fdf66d3de388", "version_major": 2, "version_minor": 0 }, @@ -671,22 +658,16 @@ "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:nncf:Dataset contains only 200 samples, smaller than the requested subset size 300.\n" + ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c046fc48e15b49ef8fb286d368dae60c", + "model_id": "a4cb6bd6a1094ce38df290209cb42e5a", "version_major": 2, "version_minor": 0 }, @@ -707,40 +688,26 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "text/html": [ - "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ - "INFO:nncf:3 ignored nodes were found by name in the NNCFGraph\n", - "INFO:nncf:448 ignored nodes were found by name in the NNCFGraph\n", + "INFO:nncf:3 ignored nodes were found by names in the NNCFGraph\n", "INFO:nncf:Not adding activation input quantizer for operation: 6 __module.model.conv_in/aten::_convolution/Convolution\n", - "14 __module.model.conv_in/aten::_convolution/Add\n", + "12 __module.model.conv_in/aten::_convolution/Add\n", "\n", - "INFO:nncf:Not adding activation input quantizer for operation: 317 __module.model.up_blocks.2.resnets.2.conv_shortcut/aten::_convolution/Convolution\n", - "543 __module.model.up_blocks.2.resnets.2.conv_shortcut/aten::_convolution/Add\n", + "INFO:nncf:Not adding activation input quantizer for operation: 308 __module.model.up_blocks.2.resnets.2.conv_shortcut/aten::_convolution/Convolution\n", + "461 __module.model.up_blocks.2.resnets.2.conv_shortcut/aten::_convolution/Add\n", "\n", - "INFO:nncf:Not adding activation input quantizer for operation: 1242 __module.model.conv_out/aten::_convolution/Convolution\n", - "1426 __module.model.conv_out/aten::_convolution/Add\n", + "INFO:nncf:Not adding activation input quantizer for operation: 800 __module.model.conv_out/aten::_convolution/Convolution\n", + "885 __module.model.conv_out/aten::_convolution/Add\n", "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3ac8ad10e984401c9b23def6d84f2cca", + "model_id": "57d67a9df9754fb98588423125f0df0a", "version_major": 2, "version_minor": 0 }, @@ -762,22 +729,16 @@ "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:nncf:Dataset contains only 200 samples, smaller than the requested subset size 300.\n" + ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d564df9c3aed449abdb1a58dbe77a1fb", + "model_id": "592c9c9e2aec45e79d3f1f71060b7de4", "version_major": 2, "version_minor": 0 }, @@ -797,19 +758,6 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ @@ -847,44 +795,26 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "4381e145", + "execution_count": 45, + "id": "fe2bdba7-63cf-44b5-b238-f92f52542b16", + "metadata": {}, + "outputs": [], + "source": [ + "%%skip not $to_quantize.value\n", + "\n", + "text2image_pipe_int8 = ov_genai.Text2ImagePipeline(model_dir_int8, device=device.value)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "9f1ce4b8-e359-4486-9403-1851df2c5e91", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the text_encoder to AUTO ...\n", - "Compiling the text_encoder_2 to AUTO ...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e89372515f8647a9a1d5eb9547740102", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00" ] @@ -896,58 +826,43 @@ "source": [ "%%skip not $to_quantize.value\n", "\n", - "from IPython.display import display\n", - "\n", - "int8_text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value, compile=False)\n", - "int8_text2image_pipe.unet.model = core.read_model(UNET_INT8_OV_PATH)\n", - "int8_text2image_pipe.unet.request = None\n", - "\n", "prompt = \"cute cat\"\n", - "image = int8_text2image_pipe(prompt, num_inference_steps=1, height=512, width=512, guidance_scale=0.0, generator=np.random.RandomState(987)).images[0]\n", + "image_tensor = text2image_pipe_int8.generate(\n", + " prompt,\n", + " num_inference_steps=1,\n", + " height=512,\n", + " width=512,\n", + " guidance_scale=0.0,\n", + " generator=ov_genai.TorchGenerator(987),\n", + ")\n", + "image = Image.fromarray(image_tensor.data[0])\n", + "image.save(\"cat.png\")\n", "display(image)" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "3d46b9e9", + "execution_count": 47, + "id": "69caf188-b43a-48e9-929a-fb23ea741034", + "metadata": {}, + "outputs": [], + "source": [ + "%%skip not $to_quantize.value\n", + "\n", + "\n", + "image2image_pipe_int8 = ov_genai.Image2ImagePipeline(model_dir_int8, device=device.value)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "1520b899-1261-4688-b4b3-75ab30131456", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the text_encoder to AUTO ...\n", - "Compiling the text_encoder_2 to AUTO ...\n", - "Compiling the vae_encoder to AUTO ...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bfbc0015c5824b9c9e504094e5a56114", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00" ] @@ -959,15 +874,36 @@ "source": [ "%%skip not $to_quantize.value\n", "\n", - "int8_image2image_pipe = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_dir, device=device.value, compile=False)\n", - "int8_image2image_pipe.unet.model = core.read_model(UNET_INT8_OV_PATH)\n", - "int8_image2image_pipe.unet.request = None\n", "\n", + "init_image = image_to_tensor(image)\n", "photo_prompt = \"a cute cat with bow tie\"\n", - "photo_image = int8_image2image_pipe(photo_prompt, image=image, num_inference_steps=2, generator=np.random.RandomState(511), guidance_scale=0.0, strength=0.5).images[0]\n", + "\n", + "photo_image_tensor = image2image_pipe_int8.generate(\n", + " photo_prompt,\n", + " image=init_image,\n", + " num_inference_steps=3,\n", + " generator=ov_genai.TorchGenerator(60),\n", + " 
guidance_scale=0.0,\n", + " strength=0.7,\n", + ")\n", + "photo_image = Image.fromarray(photo_image_tensor.data[0])\n", + "photo_image.save(\"cat_tie.png\")\n", "display(photo_image)" ] }, + { + "cell_type": "code", + "execution_count": 49, + "id": "62553d8f-bb1e-49a9-9787-a8f8a205ad66", + "metadata": {}, + "outputs": [], + "source": [ + "%%skip not $to_quantize.value\n", + "\n", + "del image2image_pipe_int8\n", + "gc.collect();" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -980,7 +916,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 50, "id": "63fc61a9", "metadata": {}, "outputs": [ @@ -988,8 +924,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "FP16 model size: 5014578.62 KB\n", - "INT8 model size: 2517944.84 KB\n", + "FP16 model size: 5014578.44 KB\n", + "INT8 model size: 2517944.67 KB\n", "Model compression rate: 1.992\n" ] } @@ -997,8 +933,11 @@ "source": [ "%%skip not $to_quantize.value\n", "\n", - "fp16_ir_model_size = UNET_OV_PATH.with_suffix(\".bin\").stat().st_size / 1024\n", - "quantized_model_size = UNET_INT8_OV_PATH.with_suffix(\".bin\").stat().st_size / 1024\n", + "\n", + "UNET_OV_PATH = model_dir / \"unet\" / \"openvino_model.bin\"\n", + "UNET_INT8_OV_PATH = model_dir_int8 / \"unet\" / \"openvino_model.bin\"\n", + "fp16_ir_model_size = UNET_OV_PATH.stat().st_size / 1024\n", + "quantized_model_size = UNET_INT8_OV_PATH.stat().st_size / 1024\n", "\n", "print(f\"FP16 model size: {fp16_ir_model_size:.2f} KB\")\n", "print(f\"INT8 model size: {quantized_model_size:.2f} KB\")\n", @@ -1022,7 +961,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 51, "id": "914fcb4d", "metadata": {}, "outputs": [], @@ -1040,63 +979,64 @@ "\n", "def calculate_inference_time(pipe, dataset):\n", " inference_time = []\n", - " disable_progress_bar(pipe)\n", "\n", " for idx, prompt in enumerate(dataset):\n", " start = time.perf_counter()\n", - " image = pipe(\n", + " image_tensor = pipe.generate(\n", " prompt,\n", " num_inference_steps=1,\n", " guidance_scale=0.0,\n", - " generator=np.random.RandomState(23)\n", - " ).images[0]\n", + " generator=ov_genai.TorchGenerator(23),\n", + " )\n", " end = time.perf_counter()\n", " delta = end - start\n", " inference_time.append(delta)\n", " if idx >= validation_size:\n", " break\n", - " disable_progress_bar(pipe, disable=False)\n", " return np.median(inference_time)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 52, "id": "e46cddac", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the vae_decoder to AUTO ...\n", - "Compiling the unet to AUTO ...\n", - "Compiling the vae_encoder to AUTO ...\n", - "Compiling the text_encoder to AUTO ...\n", - "Compiling the text_encoder_2 to AUTO ...\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "FP16 pipeline latency: 1.775\n", - "INT8 pipeline latency: 0.673\n", - "Text-to-Image generation speed up: 2.636\n" + "FP16 pipeline latency: 2.387\n", + "INT8 pipeline latency: 1.846\n", + "Text-to-Image generation speed up: 1.293\n" ] } ], "source": [ "%%skip not $to_quantize.value\n", "\n", - "int8_latency = calculate_inference_time(int8_text2image_pipe, validation_data)\n", - "text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value)\n", + "int8_latency = calculate_inference_time(text2image_pipe_int8, validation_data)\n", + "text2image_pipe = ov_genai.Text2ImagePipeline(model_dir, device.value)\n", "fp_latency = 
calculate_inference_time(text2image_pipe, validation_data)\n", "print(f\"FP16 pipeline latency: {fp_latency:.3f}\")\n", "print(f\"INT8 pipeline latency: {int8_latency:.3f}\")\n", "print(f\"Text-to-Image generation speed up: {fp_latency / int8_latency:.3f}\")" ] }, + { + "cell_type": "code", + "execution_count": 53, + "id": "ce2df208-92ba-4677-bfd0-a20de3672c93", + "metadata": {}, + "outputs": [], + "source": [ + "%%skip not $to_quantize.value\n", + "\n", + "del text2image_pipe_int8\n", + "del text2image_pipe\n", + "gc.collect();" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -1119,14 +1059,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 54, "id": "bb0d3675", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d76c42e8912847dbadc206494f546f72", + "model_id": "aae6c237ccd641e0baa4b2d3d918216b", "version_major": 2, "version_minor": 0 }, @@ -1134,7 +1074,7 @@ "Checkbox(value=True, description='Use quantized model')" ] }, - "execution_count": 23, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1155,29 +1095,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "id": "665fe788", "metadata": {}, "outputs": [], "source": [ - "text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value)\n", "if use_quantized_model.value:\n", " if not quantized_model_present:\n", " raise RuntimeError(\"Quantized model not found.\")\n", - " text2image_pipe.unet.model = core.read_model(UNET_INT8_OV_PATH)\n", - " text2image_pipe.unet.request = core.compile_model(text2image_pipe.unet.model, device.value)\n", + " text2image_pipe = ov_genai.Text2ImagePipeline(model_dir_int8, device=device.value)\n", + "else:\n", + " text2image_pipe = ov_genai.Text2ImagePipeline(model_dir, device=device.value)\n", "\n", "\n", "def generate_from_text(text, seed, num_steps, height, width):\n", - " result = text2image_pipe(\n", + " image_tensor = text2image_pipe.generate(\n", " text,\n", " num_inference_steps=num_steps,\n", " guidance_scale=0.0,\n", - " generator=np.random.RandomState(seed),\n", + " generator=ov_genai.TorchGenerator(seed),\n", " height=height,\n", " width=width,\n", - " ).images[0]\n", - " return result" + " )\n", + " image = Image.fromarray(image_tensor.data[0])\n", + "\n", + " return image" ] }, { @@ -1223,7 +1165,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" }, "openvino_notebooks": { "imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/sdxl-turbo/sdxl-turbo.png?raw=true",