
Commit 45f3b9a

eaidova and andreyanufr authored
add notebook (#2408)
Co-authored-by: Andrei Anufriev <[email protected]>
1 parent 4896dbb commit 45f3b9a


11 files changed: +2229 -3 lines changed

.ci/ignore_convert_execution.txt

Lines changed: 2 additions & 1 deletion
@@ -61,4 +61,5 @@ notebooks/hunyuan-dit-image-generation/hunyuan-dit-image-generation.ipynb
 notebooks/stable-diffusion-v3/stable-diffusion-v3.ipynb
 notebooks/llm-rag-llamaindex/llm-rag-llamaindex.ipynb
 notebooks/llm-agent-functioncall/llm-agent-functioncall-qwen.ipynb
-notebooks/llm-agent-react/llm-agent-rag-llamaindex.ipynb
+notebooks/llm-agent-react/llm-agent-rag-llamaindex.ipynb
+notebooks/mllama-3.2/mllama-3.2.ipynb

.ci/ignore_treon_docker.txt

Lines changed: 1 addition & 0 deletions
@@ -81,4 +81,5 @@ notebooks/internvl2/internvl2.ipynb
 notebooks/qwen2-vl/qwen2-vl.ipynb
 notebooks/qwen2-audio/qwen2-audio.ipynb
 notebooks/stable-fast-3d/stable-fast-3d.ipynb
+notebooks/mllama-3.2/mllama-3.2.ipynb
 notebooks/segment-anything/segment-anything-2-image.ipynb

.ci/skipped_notebooks.yml

Lines changed: 7 additions & 0 deletions
@@ -587,6 +587,13 @@
       - '3.8'
     - os:
       - macos-12
+- notebook: notebooks/mllama-3.2/mllama-3.2.ipynb
+  skips:
+    - os:
+      - macos-12
+      - ubuntu-20.04
+      - ubuntu-22.04
+      - windows-2019
 - notebook: notebooks/llm-agent-react/llm-agent-react-langchain.ipynb
   skips:
     - python:

.ci/spellcheck/.pyspelling.wordlist.txt

Lines changed: 1 addition & 0 deletions
@@ -436,6 +436,7 @@ llava
 llm
 LLM
 LLMs
+LM
 LMS
 LLMPipeline
 logits

notebooks/mllama-3.2/README.md

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Visual-language assistant with Llama-3.2-11B-Vision and OpenVINO

Llama-3.2-11B-Vision is the latest model from the Llama 3 model family, with capabilities extended to understanding image content.
More details about the model can be found in the [model card](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/MODEL_CARD_VISION.md) and the original [repo](https://github.com/meta-llama/llama-models).

In this tutorial we consider how to convert and optimize the Llama-Vision model for creating a multimodal chatbot. Additionally, we demonstrate how to apply a stateful transformation to the LLM part and model optimization techniques such as weight compression and quantization using [NNCF](https://github.com/openvinotoolkit/nncf).
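For orientation only, here is a minimal sketch of how such NNCF optimizations are typically applied to already converted OpenVINO models; the function name, variable names, and parameter values below are illustrative assumptions, not taken from this notebook:

```python
import nncf
import openvino as ov


def optimize_models(language_model: ov.Model, image_encoder: ov.Model, calibration_data: list):
    """Illustrative optimization flow (assumed parameters): INT4 weight compression for
    the language model and post-training quantization for the image encoder."""
    # Compress language-model weights to 4-bit asymmetric; activations stay in floating point.
    compressed_lm = nncf.compress_weights(
        language_model,
        mode=nncf.CompressWeightsMode.INT4_ASYM,
        group_size=64,
        ratio=1.0,
    )
    # Quantize the image encoder using a calibration dataset (a list of input dictionaries).
    quantized_encoder = nncf.quantize(
        image_encoder,
        nncf.Dataset(calibration_data),
        model_type=nncf.ModelType.TRANSFORMER,
        subset_size=len(calibration_data),
    )
    return compressed_lm, quantized_encoder
```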
## Notebook contents
The tutorial consists of the following steps:

- Install requirements
- Convert model
- Optimize the language model using weight compression
- Optimize the image encoder using post-training quantization
- Run OpenVINO model inference
- Launch the interactive demo

In this demonstration, you'll create an interactive chatbot that can answer questions about the content of a provided image.

The image below illustrates an example of an input prompt and model answer.
![example.png](https://github.com/user-attachments/assets/1e3fde78-bae5-4b9a-8ef3-ea1291b288cf)

## Installation instructions
This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/mllama-3.2/README.md" />
Lines changed: 254 additions & 0 deletions
@@ -0,0 +1,254 @@
import torch
from datasets import load_dataset
from transformers import AutoProcessor
from tqdm.autonotebook import tqdm
from pathlib import Path
import pickle
import gc

import requests
from io import BytesIO
import numpy as np
from PIL import Image
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from ov_mllama_helper import OVMLlamaForConditionalGeneration


max_length = 4048


def check_text_data(data):
    """
    Check if the given data is text-based.
    """
    if isinstance(data, str):
        return True
    if isinstance(data, list):
        return all(isinstance(x, str) for x in data)
    return False


def get_pil_from_url(url):
    """
    Downloads and converts an image from a URL to a PIL Image object.
    """
    response = requests.get(url, verify=False, timeout=20)
    image = Image.open(BytesIO(response.content))
    return image.convert("RGB")


# def collate_fn_llm(example, image_column="image_url", text_column="caption"):
#     """
#     Preprocesses an example by loading and transforming image and text data.
#     Checks if the text data in the example is valid by calling the `check_text_data` function.
#     Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
#     If there is any error during the download process, returns None.
#     Returns the preprocessed inputs with transformed image and text data.
#     """
#     assert len(example) == 1
#     example = example[0]

#     if not check_text_data(example[text_column]):
#         raise ValueError("Text data is not valid")

#     url = example[image_column]
#     try:
#         image = get_pil_from_url(url)
#         h, w = image.size
#         if h == 1 or w == 1:
#             return None
#     except Exception:
#         return None

#     inputs = processor(text="<|image|><|begin_of_text|>"+example[text_column], images=image, return_tensors="pt", padding=True)
#     if inputs['input_ids'].shape[1] > max_length:
#         return None
#     return inputs


def prepare_calibration_data_vision(dataloader, init_steps):
    """
    This function prepares calibration data from a dataloader for a specified number of initialization steps.
    It iterates over the dataloader, fetching batches and storing the relevant data.
    """
    prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    model_id = "Llama-3.2-11B-Vision-Instruct/OV"
    processor = AutoProcessor.from_pretrained(model_id)
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    data = []
    print(f"Fetching {init_steps} samples for the initialization...")
    with tqdm(total=init_steps) as pbar:
        for batch in dataloader:
            if len(data) == init_steps:
                break
            if batch:
                pbar.update(1)
                with torch.no_grad():
                    data.append(
                        {
                            "pixel_values": batch["pixel_values"].to("cpu"),
                            "aspect_ratio_ids": inputs.data["aspect_ratio_ids"].to("cpu"),
                            "aspect_ratio_mask": inputs.data["aspect_ratio_mask"],
                        }
                    )
    return data


def prepare_dataset_vision(processor, opt_init_steps=50, max_train_samples=1000, file_path="vision_dataset.pickle", save_dataset=True):
    """
    Prepares a vision-text dataset for quantization.
    """

    def collate_fn(example, image_column="image_url", text_column="caption"):
        """
        Preprocesses an example by loading and transforming image and text data.
        Checks if the text data in the example is valid by calling the `check_text_data` function.
        Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
        If there is any error during the download process, returns None.
        Returns the preprocessed inputs with transformed image and text data.
        """
        assert len(example) == 1
        example = example[0]

        if not check_text_data(example[text_column]):
            raise ValueError("Text data is not valid")

        url = example[image_column]
        try:
            image = get_pil_from_url(url)
            h, w = image.size
            if h == 1 or w == 1:
                return None
        except Exception:
            return None
        inputs = processor(
            text="<|image|><|begin_of_text|> Please describe image content based on information: " + example[text_column],
            images=image,
            return_tensors="pt",
            padding=True,
        )
        if inputs["input_ids"].shape[1] > max_length:
            return None
        return inputs

    if not Path(file_path).exists():
        dataset = load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True)
        train_dataset = dataset["train"].shuffle(seed=42)
        dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)
        calibration_data = prepare_calibration_data_vision(dataloader, opt_init_steps)
        print(f"calibration dataset will be saved in {file_path}")
        with open(file_path, "wb") as f:
            pickle.dump(calibration_data, f)
    else:
        with open(file_path, "rb") as f:
            calibration_data = pickle.load(f)

    return calibration_data


def prepare_calibration_data_llm(dataloader, init_steps, mllm, processor):
    """
    This function prepares calibration data from a dataloader for a specified number of initialization steps.
    It iterates over the dataloader, fetching batches and storing the relevant data.
    """
    data = []

    prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    print(f"Fetching {init_steps} samples for the initialization...")
    with tqdm(total=init_steps) as pbar:
        for batch in dataloader:
            if len(data) == init_steps:
                break
            if batch:
                pbar.update(1)
                with torch.no_grad():
                    cache_position = np.cumsum(batch.data["attention_mask"].to("cpu"), axis=1) - 1
                    cache_position[batch.data["attention_mask"] == 0] = 1

                    vision_input = {
                        "pixel_values": batch["pixel_values"].to("cpu"),
                        "aspect_ratio_ids": batch.data["aspect_ratio_ids"].to("cpu"),
                        "aspect_ratio_mask": batch.data["aspect_ratio_mask"].to("cpu"),
                        "cross_attention_mask": batch.data["cross_attention_mask"].to("cpu"),
                        "cache_position": cache_position[0, :],
                    }

                    cross_attention_states = mllm.prepare_vision_outputs(**vision_input)
                    res = {"input_ids": batch.data["input_ids"].to("cpu"), "attention_mask": batch.data["attention_mask"].to("cpu"), **cross_attention_states}
                    position_ids = np.cumsum(res["attention_mask"], axis=1) - 1
                    position_ids[res["attention_mask"] == 0] = 1
                    res["position_ids"] = position_ids

                    res = mllm.prepare_llm_inputs(**res)
                    data.append(res)
    return data


def prepare_dataset_llm(mllm_id, opt_init_steps=50, max_train_samples=1000, file_path="llm_dataset.pickle", save_dataset=False):
    """
    Prepares a vision-text dataset for quantization.
    """

    if Path(file_path).exists():
        print(f"calibration dataset will be loaded from {file_path}")
        with open(file_path, "rb") as f:
            calibration_data = pickle.load(f)
        return calibration_data

    mllm = OVMLlamaForConditionalGeneration(mllm_id, slice_lm_head=False)
    processor = AutoProcessor.from_pretrained(mllm_id)

    def collate_fn(example, image_column="image_url", text_column="caption"):
        """
        Preprocesses an example by loading and transforming image and text data.
        Checks if the text data in the example is valid by calling the `check_text_data` function.
        Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
        If there is any error during the download process, returns None.
        Returns the preprocessed inputs with transformed image and text data.
        """
        assert len(example) == 1
        example = example[0]

        if not check_text_data(example[text_column]):
            raise ValueError("Text data is not valid")

        url = example[image_column]
        try:
            image = get_pil_from_url(url)
            h, w = image.size
            if h == 1 or w == 1:
                return None
        except Exception:
            return None
        inputs = processor(
            text="<|image|><|begin_of_text|> Please describe image content based on information: " + example[text_column],
            images=image,
            return_tensors="pt",
            padding=True,
        )
        if inputs["input_ids"].shape[1] > max_length:
            return None
        return inputs

    dataset = load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True)
    train_dataset = dataset["train"].shuffle(seed=42)
    dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)
    calibration_data = prepare_calibration_data_llm(dataloader, opt_init_steps, mllm, processor)

    if save_dataset:
        with open(file_path, "wb") as f:
            print(f"calibration data will be saved into {file_path}")
            pickle.dump(calibration_data, f)

    del mllm
    gc.collect()

    return calibration_data
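
As a usage sketch only (the module name and file paths below are assumptions for illustration, not part of this commit), the helpers above could be invoked roughly like this to collect the calibration datasets:

```python
# Hedged usage sketch; names marked "assumed" are not defined by this commit.
from transformers import AutoProcessor

from data_preprocessing import prepare_dataset_llm, prepare_dataset_vision  # assumed module name for the file above

# Directory name matches the model_id hardcoded above; assumed to hold the converted OpenVINO model.
model_dir = "Llama-3.2-11B-Vision-Instruct/OV"
processor = AutoProcessor.from_pretrained(model_dir)

# Vision-encoder calibration samples: downloads Conceptual Captions images on first run
# and caches the result in vision_dataset.pickle.
vision_calibration = prepare_dataset_vision(processor, opt_init_steps=32)

# Language-model calibration samples: loads the OVMLlamaForConditionalGeneration wrapper
# internally, runs the vision part to obtain cross-attention states, and caches
# llm_dataset.pickle only when save_dataset=True.
llm_calibration = prepare_dataset_llm(model_dir, opt_init_steps=32, save_dataset=True)

# Each returned list can then be wrapped in nncf.Dataset(...) for the quantization
# step sketched in the README above.
```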

0 commit comments