diff --git a/notebooks/stable-diffusion-xl/gradio_helper.py b/notebooks/stable-diffusion-xl/gradio_helper.py index 8ea17716856..abc086a68d5 100644 --- a/notebooks/stable-diffusion-xl/gradio_helper.py +++ b/notebooks/stable-diffusion-xl/gradio_helper.py @@ -1,19 +1,25 @@ import gradio as gr from diffusers.utils import load_image import numpy as np +from PIL import Image + +import openvino as ov +import openvino_genai as ov_genai # TODO Consider reusing make_demo_segmind_vegart def make_demo_sd_xl_text2image(pipeline): def generate_from_text(text, seed, num_steps): - result = pipeline( + image_tensor = pipeline.generate( text, num_inference_steps=num_steps, - generator=np.random.RandomState(seed), height=512, width=512, - ).images[0] - return result + generator=ov_genai.TorchGenerator(seed), + ) + image = Image.fromarray(image_tensor.data[0]) + + return image with gr.Blocks() as demo: with gr.Column(): @@ -59,13 +65,21 @@ def make_demo_sd_xl_image2image(pipeline): ) def generate_from_image(text, image, seed, num_steps): - result = pipeline( + def image_to_tensor(image: Image) -> ov.Tensor: + pic = image.convert("RGB") + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) + return ov.Tensor(image_data) + + init_image = image_to_tensor(image) + photo_image_tensor = pipeline.generate( text, - image=image, + image=init_image, num_inference_steps=num_steps, - generator=np.random.RandomState(seed), - ).images[0] - return result + generator=ov_genai.TorchGenerator(seed), + ) + photo_image = Image.fromarray(photo_image_tensor.data[0]) + + return photo_image with gr.Blocks() as demo: with gr.Column(): diff --git a/notebooks/stable-diffusion-xl/stable-diffusion-xl.ipynb b/notebooks/stable-diffusion-xl/stable-diffusion-xl.ipynb index d0ba7d954d9..f1f7a5bda90 100644 --- a/notebooks/stable-diffusion-xl/stable-diffusion-xl.ipynb +++ b/notebooks/stable-diffusion-xl/stable-diffusion-xl.ipynb @@ -20,7 +20,7 @@ "\n", "In this tutorial, we 
consider how to run the SDXL model using OpenVINO.\n", "\n", - "We will use a pre-trained model from the [Hugging Face Diffusers](https://huggingface.co/docs/diffusers/index) library. To simplify the user experience, the [Hugging Face Optimum Intel](https://huggingface.co/docs/optimum/intel/index) library is used to convert the models to OpenVINO™ IR format.\n", + "We will use a pre-trained model from the [Hugging Face Diffusers](https://huggingface.co/docs/diffusers/index) library. To simplify the user experience, the [Hugging Face Optimum Intel](https://huggingface.co/docs/optimum/intel/index) library is used to convert the models to OpenVINO™ IR format. To run image generation, we will use [OpenVINO GenAI](https://github.com/openvinotoolkit/openvino.genai), which provides an easy-to-use API.\n", "\n", "The tutorial consists of the following steps:\n", "\n", @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "2ecf3e6d-cbc1-4b57-be08-2ded40f182ce", "metadata": { "tags": [] @@ -82,7 +82,8 @@ "source": [ "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \"torch>=2.1\" \"torchvision\" \"diffusers>=0.24.0\" \"invisible-watermark>=0.2.0\" \"transformers>=4.33.0\" \"accelerate\" \"onnx!=1.16.2\" \"peft>=0.6.2\"\n", "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\"\n", - "%pip install -q \"openvino>=2023.1.0\" \"gradio>=4.19\" \"nncf>=2.9.0\"" + "%pip install -q \"gradio>=4.19\" \"nncf>=2.15.0\"\n", + "%pip install -q -U --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly \"openvino>=2024.5\" \"openvino-genai>=2024.5\"" ] }, { @@ -97,32 +98,19 @@ "We will start with the base model part, which is responsible for the generation of images of the desired output size.
\n", "[stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) is available for downloading via the [HuggingFace hub](https://huggingface.co/models). It already provides a ready-to-use model in OpenVINO format compatible with [Optimum Intel](https://huggingface.co/docs/optimum/intel/index).\n", "\n", - "To load an OpenVINO model and run an inference with OpenVINO Runtime, you need to replace diffusers `StableDiffusionXLPipeline` with Optimum `OVStableDiffusionXLPipeline`. In case you want to load a PyTorch model and convert it to the OpenVINO format on the fly, you can set `export=True`. \n", - "\n", - "You can save the model on disk using the `save_pretrained` method." + "We will use [OpenVINO GenAI](https://github.com/openvinotoolkit/openvino.genai), which provides an easy-to-use API for running image generation. First, we will create pipelines with `Text2ImagePipeline` and `Image2ImagePipeline`. You can find more details in the [Python Image Generation Pipeline Example](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2025/0/samples/python/image_generation).\n", + "The pipeline `generate` method returns an `ov.Tensor` with the generated images. You can convert its `data` to a `PIL.Image` using `Image.fromarray`." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "e16d2760-85bd-4a5f-be1b-a7313d960c56", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-10-17 22:53:35.107765: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders.
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-10-17 22:53:35.109501: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2024-10-17 22:53:35.146015: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-10-17 22:53:35.889441: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" - ] - } - ], + "outputs": [], "source": [ "from pathlib import Path\n", - "from optimum.intel.openvino import OVStableDiffusionXLPipeline\n", + "import openvino_genai as ov_genai\n", "import gc\n", "\n", "model_id = \"stabilityai/stable-diffusion-xl-base-1.0\"\n", @@ -136,6 +124,10 @@ " )\n", " open(\"notebook_utils.py\", \"w\").write(r.text)\n", "\n", + "if not Path(\"cmd_helper.py\").exists():\n", + " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py\")\n", + " open(\"cmd_helper.py\", \"w\").write(r.text)\n", + "\n", "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n", "from notebook_utils import collect_telemetry\n", "\n", @@ -163,7 +155,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1a2bb853a5b8444cb05d627e6f789a13", + "model_id": "72c0839d90c74888b9738fe4194fef1c", "version_major": 2, "version_minor": 0 }, @@ -202,7 +194,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "29e19cfafe5643e484f5376ab941aa2c", + "model_id": "d3d4a62a92f24eb4b1d4b22d643dfae4", "version_major": 2, "version_minor": 0 }, @@ -230,13 +222,27 @@ "cell_type": "code", "execution_count": 5, "id": 
"a4e9bd80-88e7-4f97-a5b3-6274f91a7165", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ + "from cmd_helper import optimum_cli\n", + "\n", + "\n", + "additional_args = {}\n", + "\n", + "if compress_weights.value:\n", + " model_dir = model_dir / \"INT8\"\n", + " additional_args.update({\"weight-format\": \"int8\"})\n", + "else:\n", + " model_dir = model_dir / \"FP16\"\n", + " additional_args.update({\"weight-format\": \"fp16\"})\n", + "\n", "if not model_dir.exists():\n", - " !optimum-cli export openvino -m stabilityai/stable-diffusion-xl-base-1.0 --weight-format int8 {model_dir}\n", + " optimum_cli(model_id, model_dir, additional_args=additional_args)\n", "\n", - "text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value)" + "text2image_pipe = ov_genai.Text2ImagePipeline(model_dir, device.value)" ] }, { @@ -248,7 +254,7 @@ "### Run Text2Image generation pipeline\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", - "Now, we can run the model for the generation of images using text prompts. To speed up evaluation and reduce the required memory we decrease `num_inference_steps` and image size (using `height` and `width`). You can modify them to suit your needs and depend on the target hardware. We also specified a `generator` parameter based on a numpy random state with a specific seed for results reproducibility." + "Now, we can run the model for the generation of images using text prompts. To speed up evaluation and reduce the required memory we decrease `num_inference_steps` and image size (using `height` and `width`). You can modify them to suit your needs and depend on the target hardware. We also specified a `generator` parameter `openvino_genai.TorchGenerator` with a specific seed for results reproducibility." 
] }, { @@ -261,22 +267,8 @@ "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "40eab39b84504df19070913855752f09", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/25 [00:00" ] @@ -287,16 +279,19 @@ } ], "source": [ - "import torch\n", + "from PIL import Image\n", + "\n", "\n", "prompt = \"cute cat 4k, high-res, masterpiece, best quality, full hd, extremely detailed, soft lighting, dynamic angle, 35mm\"\n", - "image = text2image_pipe(\n", + "\n", + "image_tensor = text2image_pipe.generate(\n", " prompt,\n", " num_inference_steps=25,\n", " height=512,\n", " width=512,\n", - " generator=torch.Generator(device=\"cpu\").manual_seed(903512),\n", - ").images[0]\n", + " generator=ov_genai.TorchGenerator(903512),\n", + ")\n", + "image = Image.fromarray(image_tensor.data[0])\n", "image.save(\"cat.png\")\n", "image" ] @@ -327,7 +322,7 @@ "from gradio_helper import make_demo_sd_xl_text2image\n", "\n", "if text2image_pipe is None:\n", - " text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value)\n", + " text2image_pipe = ov_genai.Text2ImagePipeline(model_dir, device.value)\n", "\n", "demo = make_demo_sd_xl_text2image(text2image_pipe)\n", "\n", @@ -362,7 +357,7 @@ "### Run Image2Image generation pipeline\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", - "We can reuse the already converted model for running the Image2Image generation pipeline. For that, we should replace `OVStableDiffusionXLPipeline` with `OVStableDiffusionXLImage2ImagePipeline`." + "We can reuse the already converted model for running the Image2Image generation pipeline. For that, we should replace `Text2ImagePipeline` with `Image2ImagePipeline`. We also convert the input image to `ov.Tensor` using the `image_to_tensor` function.
" ] }, { @@ -386,7 +381,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1a2bb853a5b8444cb05d627e6f789a13", + "model_id": "72c0839d90c74888b9738fe4194fef1c", "version_major": 2, "version_minor": 0 }, @@ -410,9 +405,7 @@ "metadata": {}, "outputs": [], "source": [ - "from optimum.intel import OVStableDiffusionXLImg2ImgPipeline\n", - "\n", - "image2image_pipe = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_dir, device=device.value)" + "image2image_pipe = ov_genai.Image2ImagePipeline(model_dir, device=device.value)" ] }, { @@ -423,22 +416,8 @@ "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7ae07bb1bf54428f8e423f9be4d6d94b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/37 [00:00" ] @@ -449,16 +428,28 @@ } ], "source": [ - "import torch\n", + "import numpy as np\n", + "from PIL import Image\n", + "\n", + "import openvino as ov\n", + "\n", + "\n", + "def image_to_tensor(image: Image) -> ov.Tensor:\n", + " pic = image.convert(\"RGB\")\n", + " image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)\n", + " return ov.Tensor(image_data)\n", + "\n", "\n", + "init_image = image_to_tensor(image)\n", "photo_prompt = \"professional photo of a cat, extremely detailed, hyper realistic, best quality, full hd\"\n", - "photo_image = image2image_pipe(\n", + "photo_image_tensor = image2image_pipe.generate(\n", " photo_prompt,\n", - " image=image,\n", - " num_inference_steps=50,\n", + " image=init_image,\n", + " num_inference_steps=35,\n", " strength=0.75,\n", - " generator=torch.Generator(device=\"cpu\").manual_seed(4891),\n", - ").images[0]\n", + " generator=ov_genai.TorchGenerator(4891),\n", + ")\n", + "photo_image = Image.fromarray(photo_image_tensor.data[0])\n", "photo_image.save(\"photo_cat.png\")\n", "photo_image" ] @@ -487,7 +478,7 @@ "from gradio_helper import make_demo_sd_xl_image2image\n", "\n", "if image2image_pipe is None:\n", 
- " image2image_pipe = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_dir)\n", + " image2image_pipe = ov_genai.Image2ImagePipeline(model_dir, device=device.value)\n", "\n", "demo = make_demo_sd_xl_image2image(image2image_pipe)\n", "\n", @@ -503,28 +494,10 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "3cc2a9d6-4a39-4690-8089-fd47aecffea0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Closing server running on port: 7860\n" - ] - }, - { - "data": { - "text/plain": [ - "12351" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "demo.close()\n", "del image2image_pipe\n", @@ -548,7 +521,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" }, "openvino_notebooks": { "imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/stable-diffusion-xl/stable-diffusion-xl.png?raw=true",