diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt
index 44fdcea5afd..1e9d9794dd9 100644
--- a/.ci/spellcheck/.pyspelling.wordlist.txt
+++ b/.ci/spellcheck/.pyspelling.wordlist.txt
@@ -527,6 +527,7 @@ notus
 nsamples
 nsfw
 NSFW
+NuExtract
 num
 numpy
 NumPy
diff --git a/notebooks/nuextract-structure-extraction/README.md b/notebooks/nuextract-structure-extraction/README.md
new file mode 100644
index 00000000000..91bc8d91070
--- /dev/null
+++ b/notebooks/nuextract-structure-extraction/README.md
@@ -0,0 +1,23 @@
+# Structure Extraction with NuExtract and OpenVINO
+
+[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/nuextract-structure-extraction/nuextract-structure-extraction.ipynb)
+
+The [NuExtract](https://huggingface.co/numind/NuExtract) model is a text-to-JSON Large Language Model (LLM) that extracts arbitrarily complex information from text and turns it into structured data.
+
+## Notebook Contents
+
+The tutorial consists of the following steps:
+
+- Install prerequisites
+- Download and convert the model from a public source using the [OpenVINO integration with Hugging Face Optimum](https://huggingface.co/blog/openvino)
+- Compress model weights to INT8 and INT4 with [OpenVINO NNCF](https://github.com/openvinotoolkit/nncf)
+- Create a structure extraction inference pipeline with [Generate API](https://github.com/openvinotoolkit/openvino.genai)
+- Launch an interactive Gradio demo with the structure extraction pipeline
+
+## Installation Instructions
+
+This is a self-contained example that relies solely on its own code.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start. +For details, please refer to [Installation Guide](../../README.md). + + diff --git a/notebooks/nuextract-structure-extraction/gradio_helper.py b/notebooks/nuextract-structure-extraction/gradio_helper.py new file mode 100644 index 00000000000..514f0e0b8b6 --- /dev/null +++ b/notebooks/nuextract-structure-extraction/gradio_helper.py @@ -0,0 +1,64 @@ +import gradio as gr +from typing import Callable + +example_text = """We introduce Mistral 7B, a 7-billion-parameter language model engineered for +superior performance and efficiency. Mistral 7B outperforms the best open 13B +model (Llama 2) across all evaluated benchmarks, and the best released 34B +model (Llama 1) in reasoning, mathematics, and code generation. Our model +leverages grouped-query attention (GQA) for faster inference, coupled with sliding +window attention (SWA) to effectively handle sequences of arbitrary length with a +reduced inference cost. We also provide a model fine-tuned to follow instructions, +Mistral 7B - Instruct, that surpasses Llama 2 13B - chat model both on human and +automated benchmarks. Our models are released under the Apache 2.0 license. +Code: https://github.com/mistralai/mistral-src +Webpage: https://mistral.ai/news/announcing-mistral-7b/""" + +example_schema = """{ + "Model": { + "Name": "", + "Number of parameters": "", + "Number of max token": "", + "Architecture": [] + }, + "Usage": { + "Use case": [], + "Licence": "" + } +}""" + + +def make_demo(fn: Callable): + with gr.Blocks() as demo: + gr.Markdown("# Structure Extraction with NuExtract and OpenVINO") + + with gr.Row(): + with gr.Column(): + text_textbox = gr.Textbox( + label="Text", + placeholder="Text from which to extract information", + lines=5, + ) + schema_textbox = gr.Code( + label="JSON Schema", + language="json", + lines=5, + ) + with gr.Column(): + model_output_textbox = gr.Code( + label="Model Response", + language="json", + interactive=False, + lines=10, + ) + with gr.Row(): + gr.ClearButton(components=[text_textbox, schema_textbox, model_output_textbox]) + submit_button = gr.Button(value="Submit", variant="primary") + with gr.Row(): + gr.Examples(examples=[[example_text, example_schema]], inputs=[text_textbox, schema_textbox]) + + submit_button.click( + fn, + [text_textbox, schema_textbox], + [model_output_textbox], + ) + return demo diff --git a/notebooks/nuextract-structure-extraction/nuextract-structure-extraction.ipynb b/notebooks/nuextract-structure-extraction/nuextract-structure-extraction.ipynb new file mode 100644 index 00000000000..d4e1d77b6a3 --- /dev/null +++ b/notebooks/nuextract-structure-extraction/nuextract-structure-extraction.ipynb @@ -0,0 +1,617 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "f6aa8c52-9bde-41a6-a5f7-4fa93d5c2a6c", + "metadata": {}, + "source": [ + "# Structure Extraction with NuExtract and OpenVINO\n", + "\n", + "![image](https://github.com/user-attachments/assets/70dd93cc-da36-4c53-8891-78c0f9a41f20)\n", + "\n", + "[NuExtract](https://huggingface.co/numind/NuExtract) model is a text-to-JSON Large Language Model (LLM) that allows to extract arbitrarily complex information from text and turns it into structured data.\n", + "\n", + "LLM stands for “Large Language Model” which refers to a type of artificial intelligence model that is designed to understand and generate human-like text based on the input it receives. 
LLMs are trained on large datasets of text to learn patterns, grammar, and semantic relationships, allowing them to generate coherent and contextually relevant responses. One core capability of Large Language Models (LLMs) is to follow natural language instructions. Instruction-following models are capable of generating text in response to prompts and are often used for tasks like writing assistance, chatbots, and content generation.\n", + "\n", + "In this tutorial, we consider how to run a structure extraction text generation pipeline using NuExtract model and OpenVINO. We will use pre-trained models from the [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) library. The [Hugging Face Optimum Intel](https://huggingface.co/docs/optimum/intel/index) library converts the models to OpenVINO™ IR format. To simplify the user experience, we will use [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai) for generation inference pipeline. \n", + "\n", + "The tutorial consists of the following steps:\n", + "\n", + "- Install prerequisites\n", + "- Download and convert the model from a public source using the [OpenVINO integration with Hugging Face Optimum](https://huggingface.co/blog/openvino)\n", + "- Compress model weights to INT8 and INT4 with [OpenVINO NNCF](https://github.com/openvinotoolkit/nncf)\n", + "- Create a structure extraction inference pipeline with [Generate API](https://github.com/openvinotoolkit/openvino.genai)\n", + "- Launch interactive Gradio demo with structure extraction pipeline\n", + "\n", + "\n", + "#### Table of contents:\n", + "\n", + "- [Prerequisites](#Prerequisites)\n", + "- [Select model for inference](#Select-model-for-inference)\n", + "- [Download and convert model to OpenVINO IR via Optimum Intel CLI](#Download-and-convert-model-to-OpenVINO-IR-via-Optimum-Intel-CLI)\n", + "- [Compress model weights](#Compress-model-weights)\n", + " - [Weights Compression using Optimum Intel CLI](#weights-compression-using-optimum-intel-cli)\n", + "- [Select device for inference and model variant](#Select-device-for-inference-and-model-variant)\n", + "- [Create a structure extraction inference pipeline](#Create-a-structure-extraction-inference-pipeline)\n", + "- [Run interactive structure extraction demo with Gradio](#Run-interactive-structure-extraction-demo-with-Gradio)\n", + "\n", + "\n", + "### Installation Instructions\n", + "\n", + "This is a self-contained example that relies solely on its own code.\n", + "\n", + "We recommend running the notebook in a virtual environment. 
You only need a Jupyter server to start.\n",
+    "For details, please refer to [Installation Guide](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/README.md#-installation-guide).\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "027108c2-1fbe-4be5-9e23-3fc359185a42",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "[back to top ⬆️](#Table-of-contents:)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d0473c6-3734-422d-a370-2e39d576be0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip uninstall -q -y optimum optimum-intel\n",
+    "%pip install -Uq \"openvino>=2024.3.0\" \"openvino-genai\"\n",
+    "%pip install -q \"torch>=2.1\" \"nncf>=2.12\" \"transformers>=4.40.0\" \"accelerate\" \"gradio>=4.19\" \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "48ded239",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from pathlib import Path\n",
+    "import requests\n",
+    "import shutil\n",
+    "\n",
+    "if not Path(\"notebook_utils.py\").exists():\n",
+    "    r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\")\n",
+    "    open(\"notebook_utils.py\", \"w\").write(r.text)\n",
+    "\n",
+    "from notebook_utils import download_file\n",
+    "\n",
+    "# Fetch llm_config.py\n",
+    "llm_config_shared_path = Path(\"../../utils/llm_config.py\")\n",
+    "llm_config_dst_path = Path(\"llm_config.py\")\n",
+    "\n",
+    "if not llm_config_dst_path.exists():\n",
+    "    if llm_config_shared_path.exists():\n",
+    "        try:\n",
+    "            os.symlink(llm_config_shared_path, llm_config_dst_path)\n",
+    "        except Exception:\n",
+    "            shutil.copy(llm_config_shared_path, llm_config_dst_path)\n",
+    "    else:\n",
+    "        download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py\")\n",
+    "elif not os.path.islink(llm_config_dst_path):\n",
+    "    print(\"LLM config will be updated\")\n",
+    "    if llm_config_shared_path.exists():\n",
+    "        shutil.copy(llm_config_shared_path, llm_config_dst_path)\n",
+    "    else:\n",
+    "        download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "611cc777-d5bc-4c7b-92e4-a4befa13b2ce",
+   "metadata": {},
+   "source": [
+    "## Select model for inference\n",
+    "[back to top ⬆️](#Table-of-contents:)\n",
+    "\n",
+    "The tutorial supports different models; you can select one from the provided options to compare the quality of open-source solutions.\n",
+    ">**Note**: conversion of some models can require additional actions on the user side and at least 64GB of RAM for conversion.\n",
+    "\n",
+    "The NuExtract model has several versions:\n",
+    "\n",
+    "* **NuExtract-tiny** - This is a version of the [Qwen1.5-0.5B](https://huggingface.co/Qwen/Qwen1.5-0.5B) model with 0.5 billion parameters. More details about the model can be found in the [model card](https://huggingface.co/numind/NuExtract-tiny).\n",
+    "* **NuExtract** - This is a version of the [phi-3-mini](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model with 3.8 billion parameters. More details about the model can be found in the [model card](https://huggingface.co/numind/NuExtract).\n",
+    "* **NuExtract-large** - This is a version of the [phi-3-small](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) model with 7 billion parameters. More details about the model can be found in the [model card](https://huggingface.co/numind/NuExtract-large).\n",
+    "\n",
+    "All NuExtract models are fine-tuned on a private high-quality synthetic dataset for information extraction."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "27b42290-a9b5-4453-9a4c-ffa44bbd966d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5e56b491b11440b49baecb0bbfdd6657",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Box(children=(Box(children=(Label(value='Model:'), Dropdown(options={'NuExtract_tiny': {'model_id': 'numind/Nu…"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from llm_config import get_llm_selection_widget\n",
+    "\n",
+    "models = {\n",
+    "    \"NuExtract_tiny\": {\"model_id\": \"numind/NuExtract-tiny\"},\n",
+    "    \"NuExtract\": {\"model_id\": \"numind/NuExtract\"},\n",
+    "    \"NuExtract_large\": {\"model_id\": \"numind/NuExtract-large\"},\n",
+    "}\n",
+    "\n",
+    "form, _, model_dropdown, compression_dropdown, _ = get_llm_selection_widget(languages=None, models=models, show_preconverted_checkbox=False)\n",
+    "\n",
+    "form"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "37e9634f-4fc7-4d9c-9ade-b3e8684a0828",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Selected model NuExtract_tiny with INT4 compression\n"
+     ]
+    }
+   ],
+   "source": [
+    "model_name = model_dropdown.label\n",
+    "model_config = model_dropdown.value\n",
+    "print(f\"Selected model {model_name} with {compression_dropdown.value} compression\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "4e4fd394-b4fb-4eef-8bdc-d116572aa8f0",
+   "metadata": {},
+   "source": [
+    "## Download and convert model to OpenVINO IR via Optimum Intel CLI\n",
+    "[back to top ⬆️](#Table-of-contents:)\n",
+    "\n",
+    "The listed models are available for download from the [HuggingFace hub](https://huggingface.co/models). We will use the optimum-cli interface to export them into OpenVINO Intermediate Representation (IR) format.\n",
+    "\n",
+    "The Optimum CLI interface for converting models supports export to OpenVINO (available starting from optimum-intel version 1.12).\n",
+    "General command format:\n",
+    "\n",
+    "```bash\n",
+    "optimum-cli export openvino --model <model_id> --task <task> <output_dir>\n",
+    "```\n",
+    "\n",
+    "where the `--model` argument is a model id from the HuggingFace Hub or a local directory with the model (saved using the `.save_pretrained` method), and `--task <task>` is one of the [supported tasks](https://huggingface.co/docs/optimum/exporters/task_manager) that the exported model should solve. If `--task` is not specified, the task will be auto-inferred based on the model. If model initialization requires the use of remote code, the `--trust-remote-code` flag should additionally be passed.\n",
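+    "\n",
+    "As a concrete illustration (this exact command is not part of the original notebook, and the output directory name here is only an example), the smallest NuExtract variant could be exported with FP16 weights like this:\n",
+    "\n",
+    "```bash\n",
+    "optimum-cli export openvino --model numind/NuExtract-tiny --task text-generation-with-past --weight-format fp16 NuExtract_tiny/FP16\n",
+    "```\n",
+    "\n",
+    "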
Full list of supported arguments available via `--help` For more details and examples of usage, please check [optimum documentation](https://huggingface.co/docs/optimum/intel/inference#export).\n", + "\n", + "## Compress model weights\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "The Weights Compression algorithm is aimed at compressing the weights of the models and can be used to optimize the model footprint and performance of large models where the size of weights is relatively larger than the size of activations, for example, Large Language Models (LLM). Compared to INT8 compression, INT4 compression improves performance even more but introduces a minor drop in prediction quality.\n", + "\n", + "\n", + "### Weights Compression using Optimum Intel CLI\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Optimum Intel supports weight compression via NNCF out of the box. For 8-bit compression we pass `--weight-format int8` to `optimum-cli` command line. For 4 bit compression we provide `--weight-format int4` and some other options containing number of bits and other compression parameters. An example of this approach usage you can find in [llm-chatbot notebook](../llm-chatbot)\n", + "\n", + "\n", + ">**Note**: This tutorial involves conversion model for FP16 and INT4/INT8 weights compression scenarios. It may be memory and time-consuming in the first run. You can manually control the compression precision below.\n", + ">**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f81602ca-4674-4b61-b2c8-ca11631428b1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⌛ NuExtract_tiny conversion to INT4 started. It may takes some time.\n" + ] + }, + { + "data": { + "text/markdown": [ + "**Export command:**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "`optimum-cli export openvino --model numind/NuExtract-tiny --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 NuExtract_tiny/INT4_compressed_weights`" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Framework not specified. Using pt to export the model.\n", + "Using framework PyTorch: 2.3.1+cpu\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)\n", + "/home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/optimum/exporters/openvino/model_patcher.py:489: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if sequence_length != 1:\n", + "/home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py:110: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " if seq_len > self.max_seq_len_cached:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2KMixed-Precision assignment \u001b[90m━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[36m168/168\u001b[0m • \u001b[36m0:00:01\u001b[0m • \u001b[36m0:00:00\u001b[0m• \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hINFO:nncf:Statistics of the bitwidth distribution:\n", + "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n", + "│ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n", + "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n", + "│ 8 │ 47% (47 / 169) │ 20% (46 / 168) │\n", + "├────────────────┼─────────────────────────────┼────────────────────────���───────────────┤\n", + "│ 4 │ 53% (122 / 169) │ 80% (122 / 168) │\n", + "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n", + "\u001b[2KApplying Weight Compression \u001b[90m━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[36m169/169\u001b[0m • \u001b[36m0:00:05\u001b[0m • \u001b[36m0:00:00\u001b[0m• \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25h" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Set tokenizer padding side to left for `text-generation-with-past` task.\n", + "Replacing `(?!\\S)` pattern to `(?:$|[^\\S])` in RegexSplit operation\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ INT4 NuExtract_tiny model converted and can be found in NuExtract_tiny/INT4_compressed_weights\n" + ] + } + ], + "source": [ + "from llm_config import convert_and_compress_model\n", + "\n", + "model_dir = convert_and_compress_model(model_name, model_config, compression_dropdown.value, use_preconverted=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "60355f86-2250-4ebe-82ac-950f2d4fb01b", + "metadata": {}, + "source": [ + "Let's compare model size for different compression types" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "42c7b254-1ce4-4f23-813a-9bdc23aed327", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of model with INT4 compressed weights is 347.03 MB\n" + ] + } + ], + "source": [ + "from llm_config import compare_model_size\n", + "\n", + "compare_model_size(model_dir)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3df73379-bccc-41b1-9c94-c3040819805b", + "metadata": {}, + "source": [ + "## Select device for inference and model variant\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + ">**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "d2d7bf5b-8a05-4c3b-a36b-631af5c197e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fa8a102d350b487cb3e0a4cb397295e6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from notebook_utils import device_widget\n",
+    "\n",
+    "device = device_widget(default=\"CPU\", exclude=[\"NPU\"])\n",
+    "\n",
+    "device"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "bf13f6c3-6671-408e-ae0e-aaa3d8a6eaac",
+   "metadata": {},
+   "source": [
+    "## Create a structure extraction inference pipeline\n",
+    "[back to top ⬆️](#Table-of-contents:)\n",
+    " \n",
+    "First, we will prepare the input prompt for the NuExtract model by introducing a `prepare_input()` function. This function combines the main text, a JSON schema and optional examples into a single string that adheres to the model's specific input requirements.\n",
+    "\n",
+    "The `prepare_input()` function accepts the following parameters:\n",
+    "1. `text`: This is the primary text from which you want to extract information.\n",
+    "2. `schema`: A JSON schema string that defines the structure of the information you want to extract. This acts as a template, guiding the NuExtract model on what data to look for and how to format the output.\n",
+    "3. `examples`: An optional list of example strings. These can be used to provide the model with sample extractions, potentially improving accuracy for complex or ambiguous cases."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "14d72874",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from typing import List\n",
+    "\n",
+    "\n",
+    "def prepare_input(text: str, schema: str, examples: List[str] = [\"\", \"\", \"\"]) -> str:\n",
+    "    schema = json.dumps(json.loads(schema), indent=4)\n",
+    "    input_llm = \"<|input|>\\n### Template:\\n\" + schema + \"\\n\"\n",
+    "    for example in examples:\n",
+    "        if example != \"\":\n",
+    "            input_llm += \"### Example:\\n\" + json.dumps(json.loads(example), indent=4) + \"\\n\"\n",
+    "\n",
+    "    input_llm += \"### Text:\\n\" + text + \"\\n<|output|>\\n\"\n",
+    "    return input_llm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d41b78dd",
+   "metadata": {},
+   "source": [
+    "To simplify the user experience, we will use the [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md).\n",
+    "We will create a pipeline with `LLMPipeline`. `LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the `main model`, `tokenizer`, `detokenizer` and default `generation configuration`. \n",
+    "After that, we will configure the parameters for decoding. We can get the default config with `get_generation_config()`, set up the parameters, and apply the updated version with `set_generation_config(config)`, or pass the config directly to `generate()`. It's also possible to specify the needed options just as inputs in the `generate()` method, as shown below.\n",
+    "Then we just run the `generate` method and get the output in text format. We do not need to encode the input prompt according to the model's expected template or write post-processing code to decode the logits; LLMPipeline takes care of this for us. A short sketch of the config-based route is shown below for illustration.\n",
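+    "\n",
+    "For illustration only (this snippet is not part of the original notebook), the config-based route could look like the following, assuming the `pipe` object created in the next cell:\n",
+    "\n",
+    "```python\n",
+    "# Minimal sketch: adjust the default generation config once instead of passing options to every generate() call\n",
+    "config = pipe.get_generation_config()\n",
+    "config.max_new_tokens = 200  # limit the number of newly generated tokens\n",
+    "pipe.set_generation_config(config)\n",
+    "```"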
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f1f295d4", + "metadata": {}, + "outputs": [], + "source": [ + "from openvino_genai import LLMPipeline\n", + "\n", + "pipe = LLMPipeline(model_dir.as_posix(), device.value)\n", + "\n", + "\n", + "def run_structure_extraction(text: str, schema: str) -> str:\n", + " input = prepare_input(text, schema)\n", + " return pipe.generate(input, max_new_tokens=200)" + ] + }, + { + "cell_type": "markdown", + "id": "4820307d", + "metadata": {}, + "source": [ + "To run structure extraction inference pipeline we need to provide example text for data extraction and define output structure in a JSON schema format:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "38925684", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Model\": {\n", + " \"Name\": \"Mistral 7B\",\n", + " \"Number of parameters\": \"7-billion\",\n", + " \"Number of max token\": \"\",\n", + " \"Architecture\": [\n", + " \"grouped-query attention\",\n", + " \"sliding window attention\"\n", + " ]\n", + " },\n", + " \"Usage\": {\n", + " \"Use case\": [\n", + " \"reasoning\",\n", + " \"mathematics\",\n", + " \"code generation\"\n", + " ],\n", + " \"Licence\": \"Apache 2.0\"\n", + " }\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "text = \"\"\"We introduce Mistral 7B, a 7-billion-parameter language model engineered for\n", + "superior performance and efficiency. Mistral 7B outperforms the best open 13B\n", + "model (Llama 2) across all evaluated benchmarks, and the best released 34B\n", + "model (Llama 1) in reasoning, mathematics, and code generation. Our model\n", + "leverages grouped-query attention (GQA) for faster inference, coupled with sliding\n", + "window attention (SWA) to effectively handle sequences of arbitrary length with a\n", + "reduced inference cost. We also provide a model fine-tuned to follow instructions,\n", + "Mistral 7B - Instruct, that surpasses Llama 2 13B - chat model both on human and\n", + "automated benchmarks. 
Our models are released under the Apache 2.0 license.\n", + "Code: https://github.com/mistralai/mistral-src\n", + "Webpage: https://mistral.ai/news/announcing-mistral-7b/\"\"\"\n", + "\n", + "schema = \"\"\"{\n", + " \"Model\": {\n", + " \"Name\": \"\",\n", + " \"Number of parameters\": \"\",\n", + " \"Number of max token\": \"\",\n", + " \"Architecture\": []\n", + " },\n", + " \"Usage\": {\n", + " \"Use case\": [],\n", + " \"Licence\": \"\"\n", + " }\n", + "}\"\"\"\n", + "\n", + "output = run_structure_extraction(text, schema)\n", + "print(output)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "31ebb167-0e55-4271-aedd-13814c2356d2", + "metadata": {}, + "source": [ + "## Run interactive structure extraction demo with Gradio\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f222d02-847a-490f-8d66-02608a53259b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not Path(\"gradio_helper.py\").exists():\n", + " r = requests.get(\n", + " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/nuextract-structure-extraction/gradio_helper.py\"\n", + " )\n", + " open(\"gradio_helper.py\", \"w\").write(r.text)\n", + "\n", + "from gradio_helper import make_demo\n", + "\n", + "demo = make_demo(fn=run_structure_extraction)\n", + "\n", + "try:\n", + " demo.launch(height=800)\n", + "except Exception:\n", + " demo.launch(share=True, height=800)\n", + "# If you are launching remotely, specify server_name and server_port\n", + "# EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')`\n", + "# To learn more please refer to the Gradio docs: https://gradio.app/docs/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "405f3117", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment and run this cell for stopping gradio interface\n", + "# demo.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + }, + "openvino_notebooks": { + "imageUrl": "https://github.com/user-attachments/assets/70dd93cc-da36-4c53-8891-78c0f9a41f20", + "tags": { + "categories": [ + "Model Demos", + "AI Trends" + ], + "libraries": [], + "other": [ + "LLM" + ], + "tasks": [ + "Text Generation" + ] + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/utils/llm_config.py b/utils/llm_config.py index ac3d12f589d..893e671c01a 100644 --- a/utils/llm_config.py +++ b/utils/llm_config.py @@ -515,31 +515,37 @@ def get_optimum_cli_command(model_id, weight_format, output_dir, compression_opt SUPPORTED_OPTIMIZATIONS = ["INT4", "INT4-AWQ", "INT8", "FP16"] -def get_llm_selection_widget(): +def get_llm_selection_widget(languages=list(SUPPORTED_LLM_MODELS), models=SUPPORTED_LLM_MODELS[default_language], show_preconverted_checkbox=True): import ipywidgets as widgets - lang_drop_down = widgets.Dropdown(options=list(SUPPORTED_LLM_MODELS)) + lang_dropdown = widgets.Dropdown(options=languages or []) # Define dependent drop down - model_drop_down = 
widgets.Dropdown(options=(SUPPORTED_LLM_MODELS[default_language])) + model_dropdown = widgets.Dropdown(options=models) def dropdown_handler(change): global default_language default_language = change.new # If statement checking on dropdown value and changing options of the dependent dropdown accordingly - model_drop_down.options = SUPPORTED_LLM_MODELS[change.new] + model_dropdown.options = SUPPORTED_LLM_MODELS[change.new] - lang_drop_down.observe(dropdown_handler, names="value") - compression_drop_down = widgets.Dropdown(options=SUPPORTED_OPTIMIZATIONS) - preconverted = widgets.Checkbox(value=True) + lang_dropdown.observe(dropdown_handler, names="value") + compression_dropdown = widgets.Dropdown(options=SUPPORTED_OPTIMIZATIONS) + preconverted_checkbox = widgets.Checkbox(value=True) - form_items = [ - widgets.Box([widgets.Label(value="Language:"), lang_drop_down]), - widgets.Box([widgets.Label(value="Model:"), model_drop_down]), - widgets.Box([widgets.Label(value="Compression:"), compression_drop_down]), - widgets.Box([widgets.Label(value="Use preconverted models:"), preconverted]), - ] + form_items = [] + + if languages: + form_items.append(widgets.Box([widgets.Label(value="Language:"), lang_dropdown])) + form_items.extend( + [ + widgets.Box([widgets.Label(value="Model:"), model_dropdown]), + widgets.Box([widgets.Label(value="Compression:"), compression_dropdown]), + ] + ) + if show_preconverted_checkbox: + form_items.append(widgets.Box([widgets.Label(value="Use preconverted models:"), preconverted_checkbox])) form = widgets.Box( form_items, @@ -552,7 +558,7 @@ def dropdown_handler(change): padding="1%", ), ) - return form, lang_drop_down, model_drop_down, compression_drop_down, preconverted + return form, lang_dropdown, model_dropdown, compression_dropdown, preconverted_checkbox def convert_tokenizer(model_id, remote_code, model_dir):