diff --git a/.ci/skipped_notebooks.yml b/.ci/skipped_notebooks.yml index 92943bc0c9d..43052e17b03 100644 --- a/.ci/skipped_notebooks.yml +++ b/.ci/skipped_notebooks.yml @@ -556,3 +556,9 @@ - macos-13 - ubuntu-22.04 - windows-2019 +- notebook: notebooks/llm-agent-mcp/llm-agent-mcp.ipynb + skips: + - os: + - macos-13 + - ubuntu-22.04 + - windows-2019 \ No newline at end of file diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt index f823bd40693..0974195a4f3 100644 --- a/.ci/spellcheck/.pyspelling.wordlist.txt +++ b/.ci/spellcheck/.pyspelling.wordlist.txt @@ -549,6 +549,7 @@ matplotlib MathVista MatMul MBs +MCP md MediaPipe medprob diff --git a/notebooks/llm-agent-mcp/README.md b/notebooks/llm-agent-mcp/README.md new file mode 100644 index 00000000000..27aebe0a934 --- /dev/null +++ b/notebooks/llm-agent-mcp/README.md @@ -0,0 +1,33 @@ +# Create MCP Agent using OpenVINO and Qwen-Agent + +MCP is an open protocol that standardizes how applications provide context to LLMs. Think of MCP like a USB-C port for AI applications. Just as USB-C provides a standardized way to connect your devices to various peripherals and accessories, MCP provides a standardized way to connect AI models to different data sources and tools. + +MCP helps you build agents and complex workflows on top of LLMs. LLMs frequently need to integrate with data and tools, and MCP provides: + +- A growing list of pre-built integration that your LLM can directly plug into +- The flexibility to switch between LLM providers and vendors +- Best practices for securing your data within your infrastructure + +![Image](https://github.com/user-attachments/assets/dfe1aa42-cae9-4356-be81-f010462d78a8) + +[Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) is a framework for developing LLM applications based on the instruction following, tool usage, planning, and memory capabilities of Qwen. It also comes with example applications such as Browser Assistant, Code Interpreter, and Custom Assistant. + +This notebook explores how to create a MCP Agent step by step using OpenVINO and Qwen-Agent. + +### Notebook Contents + +The tutorial consists of the following steps: + +- Install prerequisites +- Download and convert the model from a public source using the [OpenVINO integration with Hugging Face Optimum](https://huggingface.co/blog/openvino). +- Compress model weights to INT4 or INT8 precision using [NNCF](https://github.com/openvinotoolkit/nncf) +- Create an Agent +- Interactive Demo + + +## Installation Instructions + +This is a self-contained example that relies solely on its own code.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start. +For details, please refer to [Installation Guide](../../README.md). + diff --git a/notebooks/llm-agent-mcp/gradio_helper.py b/notebooks/llm-agent-mcp/gradio_helper.py new file mode 100644 index 00000000000..7cdf45760ba --- /dev/null +++ b/notebooks/llm-agent-mcp/gradio_helper.py @@ -0,0 +1,141 @@ +import os +from typing import List +from qwen_agent.gui.utils import convert_history_to_chatbot +from qwen_agent.llm.schema import Message +from qwen_agent.gui import WebUI + + +class OpenVINOUI(WebUI): + def run( + self, + messages: List[Message] = None, + share: bool = False, + server_name: str = None, + server_port: int = None, + concurrency_limit: int = 10, + enable_mention: bool = False, + **kwargs + ): + self.run_kwargs = kwargs + + from qwen_agent.gui.gradio_dep import gr, mgr, ms + + customTheme = gr.themes.Default( + primary_hue=gr.themes.utils.colors.blue, + radius_size=gr.themes.utils.sizes.radius_none, + ) + + with gr.Blocks( + css=os.path.join(os.path.dirname(__file__), "assets/appBot.css"), + theme=customTheme, + ) as demo: + history = gr.State([]) + with ms.Application(): + with gr.Row(elem_classes="container"): + with gr.Column(scale=4): + chatbot = mgr.Chatbot( + value=convert_history_to_chatbot(messages=messages), + avatar_images=[ + self.user_config, + self.agent_config_list, + ], + height=850, + avatar_image_width=80, + flushing=False, + show_copy_button=True, + latex_delimiters=[ + {"left": "\\(", "right": "\\)", "display": True}, + {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True}, + {"left": "\\begin{align}", "right": "\\end{align}", "display": True}, + {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True}, + {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True}, + {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True}, + {"left": "\\[", "right": "\\]", "display": True}, + ], + ) + + input = mgr.MultimodalInput( + placeholder=self.input_placeholder, + ) + audio_input = gr.Audio(sources=["microphone"], type="filepath") + + with gr.Column(scale=1): + if len(self.agent_list) > 1: + agent_selector = gr.Dropdown( + [(agent.name, i) for i, agent in enumerate(self.agent_list)], + label="Agents", + info="Select an Agent", + value=0, + interactive=True, + ) + + agent_info_block = self._create_agent_info_block() + + agent_plugins_block = self._create_agent_plugins_block() + + if self.prompt_suggestions: + gr.Examples( + label="Example", + examples=self.prompt_suggestions, + inputs=[input], + ) + + if len(self.agent_list) > 1: + agent_selector.change( + fn=self.change_agent, + inputs=[agent_selector], + outputs=[agent_selector, agent_info_block, agent_plugins_block], + queue=False, + ) + + input_promise = input.submit( + fn=self.add_text, + inputs=[input, audio_input, chatbot, history], + outputs=[input, audio_input, chatbot, history], + queue=False, + ) + + if len(self.agent_list) > 1 and enable_mention: + input_promise = input_promise.then( + self.add_mention, + [chatbot, agent_selector], + [chatbot, agent_selector], + ).then( + self.agent_run, + [chatbot, history, agent_selector], + [chatbot, history, agent_selector], + ) + else: + input_promise = input_promise.then( + self.agent_run, + [chatbot, history], + [chatbot, history], + ) + + input_promise.then(self.flushed, None, [input]) + + demo.load(None) + + demo.queue(default_concurrency_limit=concurrency_limit).launch(share=share, server_name=server_name, 
server_port=server_port) + + def _create_agent_plugins_block(self, agent_index=0): + from qwen_agent.gui.gradio_dep import gr + + agent_interactive = self.agent_list[agent_index] + + if agent_interactive.function_map: + capabilities = [key for key in agent_interactive.function_map.keys()] + return gr.CheckboxGroup( + label="Plugins", + value=capabilities, + choices=capabilities, + interactive=False, + ) + + else: + return gr.CheckboxGroup( + label="Plugins", + value=[], + choices=[], + interactive=False, + ) diff --git a/notebooks/llm-agent-mcp/llm-agent-mcp.ipynb b/notebooks/llm-agent-mcp/llm-agent-mcp.ipynb new file mode 100644 index 00000000000..2ecf57a9de9 --- /dev/null +++ b/notebooks/llm-agent-mcp/llm-agent-mcp.ipynb @@ -0,0 +1,635 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "b3c04f1a", + "metadata": {}, + "source": [ + "# Create MCP Agent using OpenVINO and Qwen-Agent\n", + "\n", + "MCP is an open protocol that standardizes how applications provide context to LLMs. Think of MCP like a USB-C port for AI applications. Just as USB-C provides a standardized way to connect your devices to various peripherals and accessories, MCP provides a standardized way to connect AI models to different data sources and tools.\n", + "\n", + "MCP helps you build agents and complex workflows on top of LLMs. LLMs frequently need to integrate with data and tools, and MCP provides:\n", + "\n", + "- A growing list of pre-built integration that your LLM can directly plug into\n", + "- The flexibility to switch between LLM providers and vendors\n", + "- Best practices for securing your data within your infrastructure\n", + "\n", + "![Image](https://github.com/user-attachments/assets/dfe1aa42-cae9-4356-be81-f010462d78a8)\n", + "\n", + "[Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) is a framework for developing LLM applications based on the instruction following, tool usage, planning, and memory capabilities of Qwen. It also comes with example applications such as Browser Assistant, Code Interpreter, and Custom Assistant.\n", + "\n", + "This notebook explores how to create a MCP Agent step by step using OpenVINO and Qwen-Agent.\n", + "\n", + "#### Table of contents:\n", + "\n", + "- [Prerequisites](#Prerequisites)\n", + "- [Select device for inference](#Select-device-for-inference)\n", + "- [Select model for inference](#Select-model-for-inference)\n", + "- [Convert model using Optimum-CLI tool](#Convert-model-using-Optimum-CLI-tool)\n", + " - [Weights Compression using Optimum-CLI](#Weights-Compression-using-Optimum-CLI)\n", + "- [Create an Agent](#Create-An-Agent)\n", + "- [Interactive Demo](#Interactive-Demo)\n", + "\n", + "### Installation Instructions\n", + "\n", + "This is a self-contained example that relies solely on its own code.\n", + "\n", + "We recommend running the notebook in a virtual environment. 
You only need a Jupyter server to start.\n", + "For details, please refer to [Installation Guide](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/README.md#-installation-guide).\n", + "\n", + "\n", + "\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f7bb0a67", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "[back to top ⬆️](#Table-of-contents:)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47d43de7-9946-482d-84cb-222294c1cda8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "import os\n", + "from pathlib import Path\n", + "import requests\n", + "\n", + "os.environ[\"GIT_CLONE_PROTECTION_ACTIVE\"] = \"false\"\n", + "\n", + "%pip install -Uq pip\n", + "%pip uninstall -q -y optimum optimum-intel\n", + "%pip install --pre -Uq openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n", + "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \\\n", + "\"torch>=2.1\" \"datasets\" \"accelerate\" \"transformers>=4.51.0\" \"mcp-server-time\" \"mcp-server-fetch\"\n", + "\"pydantic==2.9.2\" \"pydantic-core==2.23.4\" \"gradio>=5.0.0\" \"gradio-client==1.4.0\" \"modelscope_studio==1.0.0-beta.8\"\n", + "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \\\n", + "\"git+https://github.com/huggingface/optimum-intel.git\"\n", + "%pip install -q \"git+https://github.com/openvinotoolkit/nncf.git\"\n", + " \n", + "utility_files = [\"notebook_utils.py\", \"cmd_helper.py\"]\n", + "\n", + "for utility in utility_files:\n", + " local_path = Path(utility)\n", + " if not local_path.exists():\n", + " r = requests.get(\n", + " url=f\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{local_path.name}\",\n", + " )\n", + " with local_path.open(\"w\") as f:\n", + " f.write(r.text)\n", + "\n", + "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n", + "from notebook_utils import collect_telemetry\n", + "\n", + "collect_telemetry(\"llm-agent-mcp.ipynb\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fc895e6-b641-4b6b-b366-12663bcde4a1", + "metadata": {}, + "outputs": [], + "source": [ + "from cmd_helper import clone_repo\n", + "\n", + "clone_repo(\"https://github.com/openvino-dev-samples/Qwen-Agent.git\", revision=\"ov-genai\")\n", + "\n", + "%pip install -q -e ./Qwen-Agent/\"[gui,code_interpreter,mcp]\"" + ] + }, + { + "cell_type": "markdown", + "id": "bf6896f0", + "metadata": {}, + "source": [ + "## Select device for inference\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a81c3078", + "metadata": {}, + "outputs": [], + "source": [ + "from notebook_utils import device_widget\n", + "\n", + "device = device_widget(default=\"CPU\")\n", + "\n", + "device" + ] + }, + { + "cell_type": "markdown", + "id": "5147bfdb", + "metadata": {}, + "source": [ + "## 
Select model for inference\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Large Language Models (LLMs) are a core component of Agent. In this example, we will demonstrate how to create a OpenVINO LLM model in Qwen-Agent framework. Since Qwen3 can support function calling during text generation, we select `Qwen/Qwen3-8B` as LLM in agent pipeline.\n", + "\n", + "* **Qwen/Qwen3-8B** - Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. [Model Card](https://huggingface.co/Qwen/Qwen3-8B)\n", + "* **Qwen/Qwen3-4B** - Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. [Model Card](https://huggingface.co/Qwen/Qwen3-4B)\n", + "\n", + "\n", + "[Weight compression](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html) is a technique for enhancing the efficiency of models, especially those with large memory requirements. This method reduces the model’s memory footprint, a crucial factor for Large Language Models (LLMs). We provide several options for model weight compression:\n", + "\n", + "* **FP16** reducing model binary size on disk using `save_model` with enabled compression weights to FP16 precision. This approach is available in OpenVINO from scratch and is the default behavior.\n", + "* **INT8** is an 8-bit weight-only quantization provided by [NNCF](https://github.com/openvinotoolkit/nncf): This method compresses weights to an 8-bit integer data type, which balances model size reduction and accuracy, making it a versatile option for a broad range of applications.\n", + "* **INT4** is an 4-bit weight-only quantization provided by [NNCF](https://github.com/openvinotoolkit/nncf). involves quantizing weights to an unsigned 4-bit integer symmetrically around a fixed zero point of eight (i.e., the midpoint between zero and 15). in case of **symmetric quantization** or asymmetrically with a non-fixed zero point, in case of **asymmetric quantization** respectively. Compared to INT8 compression, INT4 compression improves performance even more, but introduces a minor drop in prediction quality. INT4 it ideal for situations where speed is prioritized over an acceptable trade-off against accuracy.\n", + "* **INT4 AWQ** is an 4-bit activation-aware weight quantization. [Activation-aware Weight Quantization](https://arxiv.org/abs/2306.00978) (AWQ) is an algorithm that tunes model weights for more accurate INT4 compression. It slightly improves generation quality of compressed LLMs, but requires significant additional time for tuning weights on a calibration dataset. We will use `wikitext-2-raw-v1/train` subset of the [Wikitext](https://huggingface.co/datasets/Salesforce/wikitext) dataset for calibration.\n", + "* **INT4 NPU-friendly** is an 4-bit channel-wise quantization. This approach is [recommended](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/genai-guide-npu.html) for LLM inference using NPU." 
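To make the weight-compression options above concrete, the snippet below is a minimal sketch of applying INT4 weight-only compression directly with [NNCF](https://github.com/openvinotoolkit/nncf) to an already-exported OpenVINO IR. It is illustrative only: the notebook itself drives compression through `optimum-cli` in the next section, and the model paths used here are placeholders.

```python
import nncf
import openvino as ov

core = ov.Core()
# Placeholder path: an FP16 IR previously exported with optimum-cli.
ov_model = core.read_model("Qwen3-8B/FP16/openvino_model.xml")

# INT4 asymmetric weight-only compression with the notebook's default INT4 settings:
# group size 128 and 80% of eligible layers in 4-bit (the remainder stays in 8-bit).
compressed_model = nncf.compress_weights(
    ov_model,
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=128,
    ratio=0.8,
)

# Save without additional FP16 conversion, since the weights are already compressed.
ov.save_model(compressed_model, "Qwen3-8B/INT4_compressed_weights/openvino_model.xml", compress_to_fp16=False)
```

Note that this sketch only handles the IR itself; the tokenizer and configuration files from the original export would still need to be copied next to the compressed model.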
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb5986e7", + "metadata": {}, + "outputs": [], + "source": [ + "from llm_config import get_llm_selection_widget\n", + "\n", + "form, lang, model_id_widget, compression_variant, use_preconverted = get_llm_selection_widget(device=device.value)\n", + "\n", + "form" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cbe2438", + "metadata": {}, + "outputs": [], + "source": [ + "model_configuration = model_id_widget.value\n", + "model_id = model_id_widget.label\n", + "print(f\"Selected model {model_id} with {compression_variant.value} compression\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "259e1f0d", + "metadata": {}, + "source": [ + "## Convert model using Optimum-CLI tool\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "🤗 [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) is the interface between the 🤗 [Transformers](https://huggingface.co/docs/transformers/index) and [Diffusers](https://huggingface.co/docs/diffusers/index) libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. It provides ease-to-use cli interface for exporting models to [OpenVINO Intermediate Representation (IR)](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) format.\n", + "\n", + "
\n", + " Click here to read more about Optimum CLI usage\n", + "\n", + "The command bellow demonstrates basic command for model export with `optimum-cli`\n", + "\n", + "```\n", + "optimum-cli export openvino --model --task \n", + "```\n", + "\n", + "where `--model` argument is model id from HuggingFace Hub or local directory with model (saved using `.save_pretrained` method), `--task ` is one of [supported task](https://huggingface.co/docs/optimum/exporters/task_manager) that exported model should solve. For LLMs it is recommended to use `text-generation-with-past`. If model initialization requires to use remote code, `--trust-remote-code` flag additionally should be passed.\n", + "
\n", + "\n", + "### Weights Compression using Optimum-CLI\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "You can also apply fp16, 8-bit or 4-bit weight compression on the Linear, Convolutional and Embedding layers when exporting your model with the CLI. \n", + "
\n", + " Click here to read more about weights compression with Optimum CLI\n", + "\n", + "Setting `--weight-format` to respectively fp16, int8 or int4. This type of optimization allows to reduce the memory footprint and inference latency.\n", + "By default the quantization scheme for int8/int4 will be [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `--sym`.\n", + "\n", + "For INT4 quantization you can also specify the following arguments :\n", + "- The `--group-size` parameter will define the group size to use for quantization, -1 it will results in per-column quantization.\n", + "- The `--ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to int4 while 10% will be quantized to int8.\n", + "\n", + "Smaller group_size and ratio values usually improve accuracy at the sacrifice of the model size and inference latency.\n", + "You can enable AWQ to be additionally applied during model export with INT4 precision using `--awq` flag and providing dataset name with `--dataset`parameter (e.g. `--dataset wikitext2`)\n", + "\n", + ">**Note**: Applying AWQ requires significant memory and time.\n", + "\n", + ">**Note**: It is possible that there will be no matching patterns in the model to apply AWQ, in such case it will be skipped.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "981df8fe-cfcf-455a-919e-dda36f3b5dfb", + "metadata": {}, + "outputs": [], + "source": [ + "from llm_config import convert_and_compress_model\n", + "\n", + "model_dir = convert_and_compress_model(model_id, model_configuration, compression_variant.value, use_preconverted=use_preconverted.value)" + ] + }, + { + "cell_type": "markdown", + "id": "d0b18586-b559-4281-942c-356e19f880ac", + "metadata": {}, + "source": [ + "## Configure MCP servers\n", + "\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "MCP server can be configured into an [MCP client](https://github.com/modelcontextprotocol/servers?tab=readme-ov-file#using-an-mcp-client). The configuration of MCP server be selected from [public MCP servers list](https://github.com/punkpeye/awesome-mcp-servers), or from your customized MCP server." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "625a2e19-56be-478c-87cf-4d906b08a20f", + "metadata": {}, + "source": [ + "## Create An agent\n", + "\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Function calling allows a model to detect when one or more tools should be called and respond with the inputs that should be passed to those tools. In an API call, you can describe tools and have the model intelligently choose to output a structured object like JSON containing arguments to call these tools. The goal of tools APIs is to more reliably return valid and useful tool calls than what can be done using a generic text completion or chat API.\n", + "\n", + "We can take advantage of this structured output, combined with the fact that you can bind multiple tools to a tool calling chat model and allow the model to choose which one to call, to create an agent that repeatedly calls tools and receives results until a query is resolved.\n", + "\n", + "OpenVINO has been integrated into the `Qwen-Agent` framework. You can use following method to create a OpenVINO based LLM for a `Qwen-Agent` pipeline.\n", + "Qwen-Agent offers a generic Agent class: the Assistant class, which, when directly instantiated, can handle the majority of Single-Agent tasks. Features:\n", + "\n", + "- It supports role-playing.\n", + "- It provides automatic planning and tool calls abilities.\n", + "- RAG (Retrieval-Augmented Generation): It accepts documents input, and can use an integrated RAG strategy to parse the documents.\n", + "\n", + "MCP server can be configured into an [MCP client](https://github.com/modelcontextprotocol/servers?tab=readme-ov-file#using-an-mcp-client). The configuration of MCP server be selected from [public MCP servers list](https://github.com/punkpeye/awesome-mcp-servers), or from your customized MCP server. Since the examples of the MCP server in this notebook are in remote, please make sure your system is connected with internet." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abfaab28-fd5b-46cd-88ad-b60ea5a3cdd6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting mcp_test.py\n" + ] + } + ], + "source": [ + "%%writefile mcp_test.py\n", + "\n", + "import argparse\n", + "import openvino.properties as props\n", + "import openvino.properties.hint as hints\n", + "import openvino.properties.streams as streams\n", + "from qwen_agent.agents import Assistant\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " parser = argparse.ArgumentParser(add_help=False)\n", + " parser.add_argument('-h',\n", + " '--help',\n", + " action='help',\n", + " help='Show this help message and exit.')\n", + " parser.add_argument('-m',\n", + " '--model_dir',\n", + " required=True,\n", + " type=str,\n", + " help='Required. model path')\n", + " parser.add_argument('-d',\n", + " '--device',\n", + " default='CPU',\n", + " required=False,\n", + " type=str,\n", + " help='Required. device for inference')\n", + " args = parser.parse_args()\n", + "\n", + " tools = [\n", + " {\n", + " 'mcpServers': { # You can specify the MCP configuration file\n", + " 'time': {\n", + " 'command': 'python',\n", + " 'args': ['-m', 'mcp_server_time', '--local-timezone=Asia/Shanghai']\n", + " },\n", + " 'fetch': {\n", + " 'command': 'python',\n", + " 'args': ['-m', 'mcp_server_fetch']\n", + " }\n", + " }\n", + " },\n", + " 'code_interpreter', # Built-in tools\n", + " ]\n", + "\n", + "\n", + " llm_cfg = {\n", + " \"ov_model_dir\": args.model_dir,\n", + " \"model_type\": \"openvino-genai\",\n", + " \"device\": args.device,\n", + " \"disable_thinking\": True,\n", + " \"chat_mode\": True,\n", + " \"genai_chat_template\":\"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}\"\n", + " }\n", + "\n", + " bot = Assistant(llm=llm_cfg,\n", + " system_message=\"/no_think \",\n", + " function_list=tools,\n", + " name='Qwen3 Tool-calling Demo',\n", + " description=\"I'm a demo using the Qwen3 tool calling. Welcome to add and play with your own tools!\")\n", + "\n", + " messages = [{'role': 'user', 'content': 'What time is it?'}]\n", + " response_plain_text = ''\n", + " for response in bot.run(messages=messages):\n", + " pass\n", + " print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49e50941-16dd-4229-9b63-cd055a882915", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-28 17:01:10,144 - mcp_manager.py - 122 - INFO - Initializing MCP tools from mcp servers: ['time', 'fetch']\n", + "2025-05-28 17:01:10,159 - mcp_manager.py - 340 - INFO - Initializing a MCP stdio_client, if this takes forever, please check the config of this mcp server: time\n", + "2025-05-28 17:01:27,661 - mcp_manager.py - 350 - INFO - No list resources: Method not found\n", + "2025-05-28 17:01:27,670 - mcp_manager.py - 340 - INFO - Initializing a MCP stdio_client, if this takes forever, please check the config of this mcp server: fetch\n", + "Downloading lxml (3.6MiB)\n", + " Downloading lxml\n", + "Installed 36 packages in 181ms\n", + "2025-05-28 17:01:32,959 - mcp_manager.py - 350 - INFO - No list resources: Method not found\n", + "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", + "2025-05-28 17:01:44,609 - mcp_manager.py - 277 - INFO - There are still tasks in `MCPManager().loop`, force terminating the MCP tool processes. There may be some exceptions.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "[TOOL_CALL] \n", + "ime-get_current_time\n", + "{\"timezone\": \"Asia/Shanghai\"}\n", + "[TOOL_RESPONSE] time-get_current_time\n", + "{\n", + " \"timezone\": \"Asia/Shanghai\",\n", + " \"datetime\": \"2025-05-28T17:01:38+08:00\",\n", + " \"is_dst\": false\n", + "}\n", + "\n", + "\n", + "\n", + "\n", + "The current time in Asia/Shanghai is 2025-05-28T17:01:38+08:00.\n" + ] + } + ], + "source": [ + "!python mcp_test.py --model_dir {str(model_dir)} --device {device.value}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "688ced57", + "metadata": {}, + "source": [ + "## Interactive Demo\n", + "\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Let's create a interactive agent using [Gradio](https://www.gradio.app/)." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3c631309-8696-4670-89a6-40ea8f357661", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from PIL import Image\n", + "import requests\n", + "\n", + "openvino_logo = \"openvino_logo.png\"\n", + "openvino_logo_url = \"https://cdn-avatars.huggingface.co/v1/production/uploads/1671615670447-6346651be2dcb5422bcd13dd.png\"\n", + "\n", + "if not Path(openvino_logo).exists():\n", + " image = Image.open(requests.get(openvino_logo_url, stream=True).raw)\n", + " image.save(openvino_logo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "752e052d-74d7-4df7-81eb-bbf317196ab1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting mcp_demo.py\n" + ] + } + ], + "source": [ + "%%writefile mcp_demo.py\n", + "\n", + "import argparse\n", + "import openvino.properties as props\n", + "import openvino.properties.hint as hints\n", + "import openvino.properties.streams as streams\n", + "from qwen_agent.agents import Assistant\n", + "from gradio_helper import OpenVINOUI\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " parser = argparse.ArgumentParser(add_help=False)\n", + " parser.add_argument('-h',\n", + " '--help',\n", + " action='help',\n", + " help='Show this help message and exit.')\n", + " parser.add_argument('-m',\n", + " '--model_dir',\n", + " required=True,\n", + " type=str,\n", + " help='Required. model path')\n", + " parser.add_argument('-d',\n", + " '--device',\n", + " default='CPU',\n", + " required=False,\n", + " type=str,\n", + " help='Required. 
device for inference')\n", + " args = parser.parse_args()\n", + "\n", + " tools = [\n", + " {\n", + " 'mcpServers': { # You can specify the MCP configuration file\n", + " 'time': {\n", + " 'command': 'python',\n", + " 'args': ['-m', 'mcp_server_time', '--local-timezone=Asia/Shanghai']\n", + " },\n", + " 'fetch': {\n", + " 'command': 'python',\n", + " 'args': ['-m', 'mcp_server_fetch']\n", + " },\n", + " }\n", + " },\n", + " 'code_interpreter', # Built-in tools\n", + " ]\n", + "\n", + "\n", + " llm_cfg = {\n", + " \"ov_model_dir\": args.model_dir,\n", + " \"model_type\": \"openvino-genai\",\n", + " \"device\": args.device,\n", + " \"chat_mode\": True,\n", + " \"disable_thinking\": True,\n", + " \"genai_chat_template\":\"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}\"\n", + " }\n", + "\n", + " bot = Assistant(llm=llm_cfg,\n", + " system_message=\"/no_think \",\n", + " function_list=tools,\n", + " name='OpenVINO MCP Demo',\n", + " description=\"I'm a demo using the Qwen3 tool calling. Welcome to add and play with your own tools!\")\n", + "\n", + " chatbot_config = {\n", + " 'prompt.suggestions': [\n", + " 'What time is it?',\n", + " \"Covert time of Shanghai to New York\"\n", + " ],\n", + " 'agent.avatar': \"openvino_logo.png\",\n", + " 'input.placeholder': \"Type your message here...\",\n", + " }\n", + "\n", + " demo = OpenVINOUI(\n", + " bot,\n", + " chatbot_config=chatbot_config,\n", + " )\n", + " demo.run(server_port=7860)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9650038b-6934-4a50-851b-41a95e5ea4db", + "metadata": {}, + "outputs": [], + "source": [ + "!python mcp_demo.py --model_dir {str(model_dir)} --device {device.value}" + ] + }, + { + "cell_type": "markdown", + "id": "adffcdc3-5520-491f-a3f9-aa426d24edcf", + "metadata": {}, + "source": [ + "Now you can visit [http://127.0.0.1:7860](http://127.0.0.1:7860) to try this demo. \n", + "If you are launching remotely, specify server_name and server_port. 
EXAMPLE: \n", + "\n", + "`demo.run(server_name='your server name', server_port='server port in int')`" + ] + }, + { + "cell_type": "markdown", + "id": "c35e9377-ecc6-4d97-a650-b5c523b2b775", + "metadata": {}, + "source": [ + "To kill the process of demo, you can run following command:\n", + "\n", + "on *Windows*\n", + "\n", + "\n", + "`!for /f \"tokens=5\" %a in ('netstat -aon ^| findstr \":7860 \"') do taskkill /f /pid %a`" + ] + }, + { + "cell_type": "markdown", + "id": "d1a3d7d9-2260-4140-a79e-e9ed36cce96c", + "metadata": {}, + "source": [ + "on *Linux*\n", + "\n", + "`!kill -9 $(lsof -t -i :7860)`" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "openvino_notebooks": { + "imageUrl": "https://github.com/user-attachments/assets/dfe1aa42-cae9-4356-be81-f010462d78a8", + "tags": { + "categories": [ + "Model Demos", + "AI Trends" + ], + "libraries": [], + "other": [ + "LLM" + ], + "tasks": [ + "Text Generation" + ] + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/llm-agent-mcp/llm_config.py b/notebooks/llm-agent-mcp/llm_config.py new file mode 100644 index 00000000000..80c64c9ebb8 --- /dev/null +++ b/notebooks/llm-agent-mcp/llm_config.py @@ -0,0 +1,166 @@ +SUPPORTED_LLM_MODELS = { + "English": { + "Qwen/Qwen3-8B": { + "model_id": "Qwen/Qwen3-8B", + }, + "Qwen/Qwen3-4B": { + "model_id": "Qwen/Qwen3-4B", + }, + }, + "Chinese": { + "Qwen/Qwen3-8B": { + "model_id": "Qwen/Qwen3-8B", + }, + "Qwen/Qwen3-4B": { + "model_id": "Qwen/Qwen3-4B", + }, + }, +} + + +compression_configs = { + "default": { + "sym": False, + "group_size": 128, + "ratio": 0.8, + }, +} + + +def get_optimum_cli_command(model_id, weight_format, output_dir, compression_options=None, enable_awq=False, trust_remote_code=False): + base_command = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format {}" + command = base_command.format(model_id, weight_format) + if compression_options: + compression_args = " --group-size {} --ratio {}".format(compression_options["group_size"], compression_options["ratio"]) + if compression_options["sym"]: + compression_args += " --sym" + if enable_awq or compression_options.get("awq", False): + compression_args += " --awq --dataset wikitext2 --num-samples 128" + if compression_options.get("scale_estimation", False): + compression_args += " --scale-estimation" + if compression_options.get("all_layers", False): + compression_args += " --all-layers" + + command = command + compression_args + if trust_remote_code: + command += " --trust-remote-code" + + command += " {}".format(output_dir) + return command + + +default_language = "English" + +SUPPORTED_OPTIMIZATIONS = ["INT4", "INT4-AWQ", "INT4-NPU", "INT8", "FP16"] + +int4_npu_config = { + "sym": True, + "group_size": -1, + "ratio": 1.0, +} + + +def get_llm_selection_widget( + languages=list(SUPPORTED_LLM_MODELS), models=SUPPORTED_LLM_MODELS[default_language], show_preconverted_checkbox=True, device=None, default_model_idx=0 +): + import ipywidgets as widgets + + lang_dropdown = widgets.Dropdown(options=languages or []) + + 
# Define dependent drop down + + model_dropdown = widgets.Dropdown(options=models, value=models[list(models)[default_model_idx]]) + + def dropdown_handler(change): + global default_language + default_language = change.new + # If statement checking on dropdown value and changing options of the dependent dropdown accordingly + model_dropdown.options = SUPPORTED_LLM_MODELS[change.new] + model_dropdown.value = SUPPORTED_LLM_MODELS[change.new][list(SUPPORTED_LLM_MODELS[change.new])[default_model_idx]] + + lang_dropdown.observe(dropdown_handler, names="value") + compression_dropdown = widgets.Dropdown(options=SUPPORTED_OPTIMIZATIONS if device != "NPU" else ["INT4-NPU", "FP16"]) + preconverted_checkbox = widgets.Checkbox(value=True) + + form_items = [] + + if languages: + form_items.append(widgets.Box([widgets.Label(value="Language:"), lang_dropdown])) + form_items.extend( + [ + widgets.Box([widgets.Label(value="Model:"), model_dropdown]), + widgets.Box([widgets.Label(value="Compression:"), compression_dropdown]), + ] + ) + if show_preconverted_checkbox: + form_items.append(widgets.Box([widgets.Label(value="Use preconverted models:"), preconverted_checkbox])) + + form = widgets.Box( + form_items, + layout=widgets.Layout( + display="flex", + flex_flow="column", + border="solid 1px", + # align_items='stretch', + width="30%", + padding="1%", + ), + ) + return form, lang_dropdown, model_dropdown, compression_dropdown, preconverted_checkbox + + +def convert_and_compress_model(model_id, model_config, precision, use_preconverted=False): + from pathlib import Path + from IPython.display import Markdown, display + import subprocess # nosec - disable B404:import-subprocess check + import platform + + pt_model_id = model_config["model_id"] + pt_model_name = model_id.split("/")[-1] + model_subdir = precision if precision == "FP16" else precision + "_compressed_weights" + model_dir = Path(pt_model_name) / model_subdir + remote_code = model_config.get("remote_code", False) + if (model_dir / "openvino_model.xml").exists(): + print(f"✅ {precision} {model_id} model already converted and can be found in {model_dir}") + return model_dir + if use_preconverted: + OV_ORG = "OpenVINO" + pt_model_name = pt_model_id.split("/")[-1] + ov_model_name = pt_model_name + f"-{precision.lower()}-ov" + ov_model_hub_id = f"{OV_ORG}/{ov_model_name}" + import huggingface_hub as hf_hub + + hub_api = hf_hub.HfApi() + if hub_api.repo_exists(ov_model_hub_id): + print(f"⌛Found preconverted {precision} {model_id}. Downloading model started. It may takes some time.") + hf_hub.snapshot_download(ov_model_hub_id, local_dir=model_dir) + print(f"✅ {precision} {model_id} model downloaded and can be found in {model_dir}") + return model_dir + + model_compression_params = {} + if "INT4" in precision: + model_compression_params = compression_configs.get(model_id, compression_configs["default"]) if not "NPU" in precision else int4_npu_config + weight_format = precision.split("-")[0].lower() + optimum_cli_command = get_optimum_cli_command(pt_model_id, weight_format, model_dir, model_compression_params, "AWQ" in precision, remote_code) + print(f"⌛ {model_id} conversion to {precision} started. 
It may takes some time.") + display(Markdown("**Export command:**")) + display(Markdown(f"`{optimum_cli_command}`")) + subprocess.run(optimum_cli_command.split(" "), shell=(platform.system() == "Windows"), check=True) + print(f"✅ {precision} {model_id} model converted and can be found in {model_dir}") + return model_dir + + +def compare_model_size(model_dir): + fp16_weights = model_dir.parent / "FP16" / "openvino_model.bin" + int8_weights = model_dir.parent / "INT8_compressed_weights" / "openvino_model.bin" + int4_weights = model_dir.parent / "INT4_compressed_weights" / "openvino_model.bin" + int4_awq_weights = model_dir.parent / "INT4-AWQ_compressed_weights" / "openvino_model.bin" + int4_npu_weights = model_dir.parent / "INT4-NPU_compressed_weights" / "openvino_model.bin" + + if fp16_weights.exists(): + print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") + for precision, compressed_weights in zip(["INT8", "INT4", "INT4-AWQ", "INT4-NPU"], [int8_weights, int4_weights, int4_awq_weights, int4_npu_weights]): + if compressed_weights.exists(): + print(f"Size of model with {precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB") + if compressed_weights.exists() and fp16_weights.exists(): + print(f"Compression rate for {precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}")
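For reference, a hypothetical notebook cell showing how the helpers above are meant to be used together; `compare_model_size` is defined but never called in the notebook itself.

```python
from llm_config import convert_and_compress_model, compare_model_size

# Export (or download a preconverted copy of) Qwen3-8B with INT4 weight compression,
# then report the on-disk size of every weight variant produced so far for this model.
model_dir = convert_and_compress_model(
    "Qwen/Qwen3-8B",
    {"model_id": "Qwen/Qwen3-8B"},
    "INT4",
    use_preconverted=True,
)
compare_model_size(model_dir)
```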