Commit d045c75

Merge branch 'ea/flex2' of https://github.com/eaidova/openvino_notebooks into ea/flex2

2 parents: 7cfdd06 + 337a6b8
9 files changed: +1404 −3 lines

.ci/ignore_treon_docker.txt

Lines changed: 1 addition & 0 deletions

@@ -39,6 +39,7 @@ notebooks/stable-diffusion-ip-adapter/stable-diffusion-ip-adapter.ipynb
  notebooks/kosmos2-multimodal-large-language-model/kosmos2-multimodal-large-language-model.ipynb
  notebooks/photo-maker/photo-maker.ipynb
  notebooks/openvoice/openvoice.ipynb
+ notebooks/openvoice2-and-melotts/openvoice2-and-melotts.ipynb
  notebooks/surya-line-level-text-detection/surya-line-level-text-detection.ipynb
  notebooks/instant-id/instant-id.ipynb
  notebooks/stable-diffusion-keras-cv/stable-diffusion-keras-cv.ipynb

.ci/skipped_notebooks.yml

Lines changed: 4 additions & 0 deletions

@@ -546,3 +546,7 @@
    skips:
      - python:
          - "3.9"
+ - notebook: notebooks/openvoice2-and-melotts/openvoice2-and-melotts.ipynb
+   skips:
+     - os:
+         - macos-13

.ci/spellcheck/.pyspelling.wordlist.txt

Lines changed: 1 addition & 0 deletions

@@ -543,6 +543,7 @@ md
  MediaPipe
  medprob
  mel
+ MeloTTS
  Mels
  MERCHANTABILITY
  MF

notebooks/README.md

Lines changed: 5 additions & 0 deletions

@@ -49,6 +49,7 @@
  - [Text-to-image generation using PhotoMaker and OpenVINO](./photo-maker/photo-maker.ipynb)
  - [Multimodal assistant with Phi-4-multimodal and OpenVINO](./phi-4-multimodal/phi-4-multimodal.ipynb)
  - [Visual-language assistant with Phi3-Vision and OpenVINO](./phi-3-vision/phi-3-vision.ipynb)
+ - [Voice tone cloning with OpenVoice2 and MeloTTS for Text-to-Speech by OpenVINO](./openvoice2-and-melotts/openvoice2-and-melotts.ipynb)
  - [Voice tone cloning with OpenVoice and OpenVINO](./openvoice/openvoice.ipynb)
  - [Running OpenCLIP models using OpenVINO™](./open-clip/open-clip.ipynb)
  - [Screen Parsing with OmniParser-v2.0 and OpenVINO](./omniparser/omniparser.ipynb)

@@ -147,6 +148,7 @@
  - [Line-level text detection with Surya](./surya-line-level-text-detection/surya-line-level-text-detection.ipynb)
  - [Convert a PyTorch Model to OpenVINO™ IR](./pytorch-to-openvino/pytorch-to-openvino.ipynb)
  - [Convert a PaddlePaddle Model to OpenVINO™ IR](./paddle-to-openvino/paddle-to-openvino-classification.ipynb)
+ - [Voice tone cloning with OpenVoice2 and MeloTTS for Text-to-Speech by OpenVINO](./openvoice2-and-melotts/openvoice2-and-melotts.ipynb)
  - [Voice tone cloning with OpenVoice and OpenVINO](./openvoice/openvoice.ipynb)
  - [OpenVINO Tokenizers: Incorporate Text Processing Into OpenVINO Pipelines](./openvino-tokenizers/openvino-tokenizers.ipynb)
  - [Object detection and masking from prompts with GroundedSAM (GroundingDINO + SAM) and OpenVINO](./grounded-segment-anything/grounded-segment-anything.ipynb)

@@ -178,6 +180,7 @@
  - [Person Tracking with OpenVINO™](./person-tracking-webcam/person-tracking.ipynb)
  - [Person Counting System using YOLOV8 and OpenVINO™](./person-counting-webcam/person-counting.ipynb)
  - [PaddleOCR with OpenVINO™](./paddle-ocr-webcam/paddle-ocr-webcam.ipynb)
+ - [Voice tone cloning with OpenVoice2 and MeloTTS for Text-to-Speech by OpenVINO](./openvoice2-and-melotts/openvoice2-and-melotts.ipynb)
  - [Voice tone cloning with OpenVoice and OpenVINO](./openvoice/openvoice.ipynb)
  - [Live Object Detection with OpenVINO™](./object-detection-webcam/object-detection.ipynb)
  - [CLIP model with Jina CLIP and OpenVINO](./jina-clip/jina-clip.ipynb)

@@ -250,6 +253,7 @@
  - [Text-to-speech (TTS) with Parler-TTS and OpenVINO](./parler-tts-text-to-speech/parler-tts-text-to-speech.ipynb)
  - [Text-to-Speech synthesis using OuteTTS and OpenVINO](./outetts-text-to-speech/outetts-text-to-speech.ipynb)
  - [Optical Character Recognition (OCR) with OpenVINO™](./optical-character-recognition/optical-character-recognition.ipynb)
+ - [Voice tone cloning with OpenVoice2 and MeloTTS for Text-to-Speech by OpenVINO](./openvoice2-and-melotts/openvoice2-and-melotts.ipynb)
  - [Voice tone cloning with OpenVoice and OpenVINO](./openvoice/openvoice.ipynb)
  - [Running OpenCLIP models using OpenVINO™](./open-clip/open-clip.ipynb)
  - [Universal Segmentation with OneFormer and OpenVINO](./oneformer-segmentation/oneformer-segmentation.ipynb)

@@ -344,6 +348,7 @@
  - [Quantization Aware Training with NNCF, using PyTorch framework](./pytorch-quantization-aware-training/pytorch-quantization-aware-training.ipynb)
  - [Post-Training Quantization of PyTorch models with NNCF](./pytorch-post-training-quantization-nncf/pytorch-post-training-quantization-nncf.ipynb)
  - [Optimize Preprocessing](./optimize-preprocessing/optimize-preprocessing.ipynb)
+ - [Voice tone cloning with OpenVoice2 and MeloTTS for Text-to-Speech by OpenVINO](./openvoice2-and-melotts/openvoice2-and-melotts.ipynb)
  - [Voice tone cloning with OpenVoice and OpenVINO](./openvoice/openvoice.ipynb)
  - [OpenVINO Tokenizers: Incorporate Text Processing Into OpenVINO Pipelines](./openvino-tokenizers/openvino-tokenizers.ipynb)
  - [Quantize NLP models with Post-Training Quantization in NNCF](./language-quantize-bert/language-quantize-bert.ipynb)

notebooks/llm-rag-langchain/llm-rag-langchain-genai.ipynb

Lines changed: 2 additions & 2 deletions

@@ -880,7 +880,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "id": "d0bab20b",
    "metadata": {},
    "outputs": [],

@@ -892,7 +892,7 @@
   "display(Markdown(f\"`{export_command}`\"))\n",
   "\n",
   "if not Path(rerank_model_id.value).exists():\n",
-  "    optimum_cli(rerank_model_configuration[\"model_id\"], str(rerank_model_id.value), show_command=False, additional_args={\"task\": \"text-classificaton\"})"
+  "    optimum_cli(rerank_model_configuration[\"model_id\"], str(rerank_model_id.value), show_command=False, additional_args={\"task\": \"text-classification\"})"
   ]
  },
  {
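
The corrected `task` value matters because it selects which model head `optimum-cli` exports for the reranker. As a minimal, hypothetical sketch of the equivalent export through the optimum-intel Python API (the model ID and output directory below are illustrative, not the notebook's exact values):

```python
from pathlib import Path

from optimum.intel import OVModelForSequenceClassification
from transformers import AutoTokenizer

model_id = "BAAI/bge-reranker-v2-m3"  # illustrative reranker checkpoint
output_dir = Path("bge-reranker-ov")  # illustrative output path

if not output_dir.exists():
    # export=True converts the PyTorch checkpoint to OpenVINO IR on the fly,
    # which corresponds to the `--task text-classification` export the cell runs via optimum-cli.
    ov_model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
    ov_model.save_pretrained(output_dir)
    AutoTokenizer.from_pretrained(model_id).save_pretrained(output_dir)
```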

notebooks/llm-rag-llamaindex/llm-rag-llamaindex.ipynb

Lines changed: 3 additions & 1 deletion

@@ -1224,7 +1224,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "id": "f7f708db-8de1-4efd-94b2-fcabc48d52f4",
    "metadata": {},
    "outputs": [

@@ -1288,7 +1288,9 @@
   "import openvino.properties as props\n",
   "import openvino.properties.hint as hints\n",
   "import openvino.properties.streams as streams\n",
+  "import openvino\n",
   "\n",
+  "core = openvino.Core()\n",
   "\n",
   "if model_to_run.value == \"INT4\":\n",
   "    model_dir = int4_model_dir\n",
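
The added `import openvino` and `core = openvino.Core()` lines give the cell a runtime handle next to the property-helper imports. A minimal sketch of how such a `Core` instance and those helpers are commonly combined (the device list and configuration values are illustrative assumptions, not the notebook's exact settings):

```python
import openvino
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams

core = openvino.Core()
print(core.available_devices)  # e.g. ['CPU', 'GPU']

# The property helpers act as keys of a configuration dictionary passed at compile time.
ov_config = {
    hints.performance_mode(): hints.PerformanceMode.LATENCY,
    streams.num(): "1",
    props.cache_dir(): "",
}
```
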
notebooks/openvoice2-and-melotts/README.md

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@

# Voice tone cloning with OpenVoice2 and MeloTTS for Text-to-Speech by OpenVINO

<!-- TODO: insert link with the image/gif -->
![OpenVoice2 demo](https://github.com/openvinotoolkit/openvino_notebooks/assets/5703039/ca7eab80-148d-45b0-84e8-a5a279846b51)

OpenVoice2 is a versatile system for instant voice tone transfer that generates speech in multiple languages from just a brief audio snippet of the source speaker, using MeloTTS models as the base speakers. OpenVoice2 includes all features of V1 and introduces several enhancements: (i) better audio quality, thanks to a revised training strategy; (ii) native multilingual support for English, Spanish, French, Chinese, Japanese, and Korean; (iii) free commercial use, since both V2 and V1 have been released under the MIT License starting from April 2024.

OpenVoice2 retains the core strengths of OpenVoice V1, including accurate tone color cloning, flexible voice style control, and zero-shot cross-lingual voice cloning.

More details about the model can be found on the [project web page](https://research.myshell.ai/open-voice), in the [paper](https://arxiv.org/abs/2312.01479), and in the official [repository](https://github.com/myshell-ai/OpenVoice).

In this tutorial, we will explore how to convert and run OpenVoice2 and MeloTTS using OpenVINO.

## Notebook Contents

This notebook demonstrates voice tone cloning with [OpenVoice2](https://github.com/myshell-ai/OpenVoice) in OpenVINO.

The tutorial consists of the following steps:

- Install prerequisites
- Load the PyTorch models
- Convert the models to OpenVINO Intermediate Representation (IR) format (see the sketch below)
- Run OpenVINO model inference on a single example
- Launch an interactive demo
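
A minimal, generic sketch of the conversion step; a toy `torch.nn.Module` stands in for the real MeloTTS/OpenVoice2 models, and the path, shapes, and device are illustrative assumptions:

```python
import torch
import openvino as ov


class TinyNet(torch.nn.Module):
    """Toy stand-in for the TTS / tone-converter models converted in the notebook."""

    def forward(self, x):
        return torch.sin(x)


model = TinyNet().eval()
example_input = torch.zeros(1, 16)

# Trace the PyTorch model and save it as OpenVINO IR (.xml + .bin).
ov_model = ov.convert_model(model, example_input=example_input)
ov.save_model(ov_model, "tiny_net.xml")

# Compile for a target device and run a single inference.
compiled = ov.Core().compile_model("tiny_net.xml", "CPU")
result = compiled(example_input.numpy())[0]
```
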
## Installation Instructions

This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to the [Installation Guide](../../README.md).

<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/openvoice2-and-melotts/README.md" />
Lines changed: 92 additions & 0 deletions

@@ -0,0 +1,92 @@

from typing import Callable
import gradio as gr


description = """
# OpenVoice2 accelerated by OpenVINO:

a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
"""

content = """
<div>
<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
</div>
"""
wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"


# Example inputs: (text prompt, style, reference audio path, license agreement flag).
examples = [
    [
        "Did you ever hear a folk tale about a giant turtle?",
        "en_latest",
        "OpenVoice/resources/demo_speaker0.mp3",
        True,
    ],
    [
        "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。",
        "zh_default",
        "OpenVoice/resources/demo_speaker1.mp3",
        True,
    ],
]


def make_demo(fn: Callable):
    """Build the Gradio demo; `fn` performs the actual TTS synthesis and tone-color conversion."""
    with gr.Blocks(analytics_enabled=False) as demo:
        with gr.Row():
            gr.Markdown(description)
        with gr.Row():
            gr.HTML(wrapped_markdown_content)

        with gr.Row():
            with gr.Column():
                input_text_gr = gr.Textbox(
                    label="Text Prompt",
                    info="One or two sentences at a time is better. Up to 50 text characters.",
                    value="The bustling city square bustled with street performers, tourists, and local vendors.",
                )
                style_gr = gr.Dropdown(
                    label="Style",
                    info="Select a style of output audio for the synthesised speech. (Chinese only supports 'default' now.)",
                    choices=[
                        "en_latest",
                        "zh_default",
                    ],
                    max_choices=1,
                    value="en_latest",
                )
                ref_gr = gr.Audio(
                    label="Reference Audio",
                    # info="Click on the button to upload your own target speaker audio",
                    type="filepath",
                    value="OpenVoice/resources/demo_speaker0.mp3",
                )
                tos_gr = gr.Checkbox(
                    label="Agree",
                    value=False,
                    info="I agree to the terms of the MIT license: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
                )

                tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

            with gr.Column():
                out_text_gr = gr.Text(label="Info")
                audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
                ref_audio_gr = gr.Audio(label="Reference Audio Used")

        gr.Examples(
            examples,
            label="Examples",
            inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
            outputs=[out_text_gr, audio_gr, ref_audio_gr],
            fn=fn,
            cache_examples=False,
        )
        tts_button.click(
            fn,
            [input_text_gr, style_gr, ref_gr, tos_gr],
            outputs=[out_text_gr, audio_gr, ref_audio_gr],
        )
    return demo
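
A hypothetical usage sketch of this helper from the notebook side; the module name `gradio_helper` and the `synthesize` callable are assumptions, and the real notebook wires in its own OpenVINO-backed pipeline:

```python
import numpy as np

from gradio_helper import make_demo  # assumes the script above is saved as gradio_helper.py


def synthesize(text: str, style: str, reference_audio: str, agree: bool):
    """Placeholder callable matching make_demo's (text, style, reference, agree) inputs."""
    if not agree:
        return "Please accept the license terms first.", None, None
    sample_rate = 22050
    audio = np.zeros(sample_rate, dtype=np.float32)  # one second of silence as a stand-in
    # A real implementation would run MeloTTS synthesis plus OpenVoice2 tone-color conversion here.
    return "Synthesis finished.", (sample_rate, audio), reference_audio


demo = make_demo(fn=synthesize)
demo.launch()
```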
