class VllmActor:
-    def __init__(self, model_path: str,mm_processor_kwargs: dict,free_gpus:list) -> None:
+
+    def __init__(self, model_path: str, mm_processor_kwargs: dict, free_gpus: list) -> None:
        """
        Overview:
            Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable.
@@ -19,7 +20,7 @@ def __init__(self, model_path: str,mm_processor_kwargs: dict,free_gpus:list) ->
        # Set CUDA_VISIBLE_DEVICES to use only free GPUs
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus))
        self.model_path = model_path
-        self.mm_processor_kwargs = mm_processor_kwargs
+        self.mm_processor_kwargs = mm_processor_kwargs
        self._initialize()

    def _initialize(self) -> None:
@@ -58,7 +59,7 @@ async def generate(self, prompt, num_samples: int, max_tokens: int, temperature:
            max_tokens=max_tokens,
            temperature=temperature,
        )
-
+
        # Using async iterator to handle vLLM's generation process
        # 1. vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs
        # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results
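
For readers unfamiliar with the pattern described in the comments above, here is a minimal illustrative sketch (not this PR's actual method body) of draining such an async generator. It assumes a vLLM AsyncLLMEngine-style object whose generate() yields partial RequestOutput objects; the names engine and collect_final_output are placeholders.

# Sketch only: consume an async generator from an AsyncLLMEngine-like object.
import uuid

from vllm import SamplingParams


async def collect_final_output(engine, prompt, num_samples: int, max_tokens: int, temperature: float):
    sampling_params = SamplingParams(n=num_samples, max_tokens=max_tokens, temperature=temperature)
    final_output = None
    # Each yielded item is the generation state so far; the last one is the complete result.
    async for request_output in engine.generate(prompt, sampling_params, str(uuid.uuid4())):
        final_output = request_output
    return final_output
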
@@ -77,11 +78,17 @@ class HuggingFaceModelGenerator:
    A LLM/VLM generator that uses Hugging Face models with vLLM as the backend.
    """

-    def __init__(self, model_path: str, free_gpus:list,
-                 max_tokens: int = 1024, temperature: float = 0, mm_processor_kwargs:dict = {
+    def __init__(
+            self,
+            model_path: str,
+            free_gpus: list,
+            max_tokens: int = 1024,
+            temperature: float = 0,
+            mm_processor_kwargs: dict = {
                "min_pixels": 28 * 28,
                "max_pixels": 1280 * 28 * 28,
-    }) -> None:
+            }
+    ) -> None:
        """
        Overview:
            Initialize the Hugging Face model generator.
@@ -90,14 +97,14 @@ def __init__(self, model_path: str, free_gpus:list,
            - max_tokens (int): The maximum number of tokens to generate, default to 1024.
            - temperature (float): The temperature for the language model, default to 0.
        """
-        self.vllm_actor = VllmActor(model_path,mm_processor_kwargs,free_gpus)
+        self.vllm_actor = VllmActor(model_path, mm_processor_kwargs, free_gpus)
        self.max_tokens = max_tokens
        self.temperature = temperature

    async def generate(
-        self,
-        prompt,
-        num_samples: int,
+            self,
+            prompt,
+            num_samples: int,
    ) -> List[Tuple[str, float]]:
        """
        Overview:
@@ -114,11 +121,8 @@ async def generate(
        response = await self.vllm_actor.generate(prompt, num_samples, self.max_tokens, self.temperature)
        # Use raw logprobs as confidence scores
        confidence_scores = [x.cumulative_logprob for x in response.outputs]
-        return [
-            (x.text.strip(), conf)
-            for x, conf in zip(response.outputs, confidence_scores)
-        ]
-
+        return [(x.text.strip(), conf) for x, conf in zip(response.outputs, confidence_scores)]
+


def get_free_gpus() -> List[int]:
    """
@@ -144,7 +148,8 @@ def get_free_gpus() -> List[int]:
        logger.warning("Failed to get GPU stats, defaulting to GPU 0")
        return [0]

-def chunk_list(original_list:list, t:int) -> List[list]:
+
+def chunk_list(original_list: list, t: int) -> List[list]:
    # chunk the list into sub_lists
    new_list = [original_list[i:i + t] for i in range(0, len(original_list), t)]
    return new_list
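
For reference only (not part of the diff): chunk_list slices a flat list into consecutive windows of size t, with a shorter final chunk when the length is not evenly divisible.

# Illustration of chunk_list semantics.
assert chunk_list(list(range(7)), 3) == [[0, 1, 2], [3, 4, 5], [6]]
# Note: main() further below pairs chunks with GPUs via zip(), which effectively assumes
# len(prompts) is divisible by len(free_gpus); otherwise a trailing chunk is silently dropped.
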
@@ -156,12 +161,15 @@ def chunk_list(original_list:list, t:int) -> List[list]:
from vllm.assets.image import ImageAsset
from enum import Enum
import concurrent.futures
+
+
class Modality(Enum):
    IMAGE = "image"
    TEXT = "text"
    VIDEO = "video"

-def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str],Optional[List[int]]]:
+
+def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str], Optional[List[int]]]:
    if modality == Modality.IMAGE:
        placeholder = "<|image_pad|>"
    elif modality == Modality.VIDEO:
@@ -179,7 +187,7 @@ def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str],Opt
        ) for question in questions
    ]
    stop_token_ids = None
-    return prompts,stop_token_ids
+    return prompts, stop_token_ids


def get_multi_modal_input(modality: Modality, filenames: list, questions: list) -> dict:
@@ -205,11 +213,11 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list)
    return ret


-async def run_vllm_collector(gpu_id:int, prompts:List, model_path:str,temperature:float) -> List[str]:
+async def run_vllm_collector(gpu_id: int, prompts: List, model_path: str, temperature: float) -> List[str]:
    # set visible gpu
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    # get a model on a single gpu
-    model = HuggingFaceModelGenerator(model_path,free_gpus=[gpu_id],temperature=temperature)
+    model = HuggingFaceModelGenerator(model_path, free_gpus=[gpu_id], temperature=temperature)

    responses_list = []
    for prompt in prompts:
@@ -220,21 +228,25 @@ async def run_vllm_collector(gpu_id:int, prompts:List, model_path:str,temperatur

    return responses_list

+
import asyncio
-def start_collector(gpu_id:int, prompts:list, model_path:str,temperature:float) -> List[str]:
-    # event loop in a process
-    results = asyncio.run(run_vllm_collector(gpu_id, prompts, model_path,temperature))
+
+
+def start_collector(gpu_id: int, prompts: list, model_path: str, temperature: float) -> List[str]:
+    # event loop in a process
+    results = asyncio.run(run_vllm_collector(gpu_id, prompts, model_path, temperature))
    return results

-def main(prompts:list, model_path:str, free_gpus:List[int],temperature:float) -> None:
-    num_tot = len(prompts)
-    num_gpu = len(free_gpus)
-    num_per_gpu = num_tot // num_gpu
-    prompts_per_gpu = chunk_list(prompts,num_per_gpu)
+
+def main(prompts: list, model_path: str, free_gpus: List[int], temperature: float) -> None:
+    num_tot = len(prompts)
+    num_gpu = len(free_gpus)
+    num_per_gpu = num_tot // num_gpu
+    prompts_per_gpu = chunk_list(prompts, num_per_gpu)
    with concurrent.futures.ProcessPoolExecutor(max_workers=len(free_gpus)) as executor:
        futures = []
-        for gpu_id,prompts_gpu in zip(free_gpus,prompts_per_gpu):
-            futures.append(executor.submit(start_collector, gpu_id, prompts_gpu, model_path,temperature))
+        for gpu_id, prompts_gpu in zip(free_gpus, prompts_per_gpu):
+            futures.append(executor.submit(start_collector, gpu_id, prompts_gpu, model_path, temperature))

        # get all results
        all_results = []
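
As a side note, a small runnable sketch (names are placeholders, not from this PR) of the pattern start_collector/main rely on: each worker process spawned by ProcessPoolExecutor runs its own event loop via asyncio.run, so the per-GPU collectors stay fully isolated from one another.

# Sketch of the one-event-loop-per-process pattern; _fake_work stands in for per-GPU vLLM generation.
import asyncio
import concurrent.futures


async def _fake_work(tag: str) -> str:
    await asyncio.sleep(0.1)
    return f"done-{tag}"


def _worker(tag: str) -> str:
    # A fresh event loop inside the child process, mirroring start_collector().
    return asyncio.run(_fake_work(tag))


if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(_worker, tag) for tag in ("gpu0", "gpu1")]
        print([f.result() for f in futures])
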
@@ -245,23 +257,19 @@ def main(prompts:list, model_path:str, free_gpus:List[int],temperature:float) -
    with open("/mnt/afs/wangqijian/tests/vllm_multi_gpu.txt", "w") as f:
        for response in all_results:
            f.write(f"{response}\n")
-
-


if __name__ == "__main__":
-    questions = ['Please describe the image.','Please describe the image.',
-                 'What\'s the text in the image?','What\'s the text in the image?',
-                 'What is in the image?','What is in the image?',
-                 'How many people are in the image?','How many people are in the image?',
-                 'What is the emotion of the main character of the image?',
-                 'What is the emotion of the main character of the image?',
-                 'How many animals are in the image?',
-                 'How many animals are in the image?',
-                 'What is the place of the image?','What is the place of the image?',
-                 'What is the peroson doing?','What is the peroson doing?'
-                 ]
-    img_names = [
+    questions = [
+        'Please describe the image.', 'Please describe the image.', 'What\'s the text in the image?',
+        'What\'s the text in the image?', 'What is in the image?', 'What is in the image?',
+        'How many people are in the image?', 'How many people are in the image?',
+        'What is the emotion of the main character of the image?',
+        'What is the emotion of the main character of the image?', 'How many animals are in the image?',
+        'How many animals are in the image?', 'What is the place of the image?', 'What is the place of the image?',
+        'What is the peroson doing?', 'What is the peroson doing?'
+    ]
+    img_names = [
        '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2127)',
        '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5394)',
        '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1160)',
@@ -278,13 +286,13 @@ def main(prompts:list, model_path:str, free_gpus:List[int],temperature:float) -
        '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2284)',
        '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4533)',
        '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5495)'
-        ]
-    free_gpus = get_free_gpus()
+    ]
+    free_gpus = get_free_gpus()
    modality = Modality.IMAGE
    mm_input = get_multi_modal_input(modality, img_names, questions)
    data = mm_input["data"]
    question = mm_input["question"]
    prompts, stop_token_ids = get_prompts_qwen(question, modality)
-    model_path = '/mnt/afs/share/Qwen2-VL-7B'
-    temperature = 0.5
-    main(prompts,model_path,free_gpus,temperature)
+    model_path = '/mnt/afs/share/Qwen2-VL-7B'
+    temperature = 0.5
+    main(prompts, model_path, free_gpus, temperature)
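
For context only (assumptions, not shown in this diff): with Qwen2-VL served through vLLM, prompts built around the <|image_pad|> placeholder are usually wrapped in the model's chat template and paired with the loaded image via vLLM's multi_modal_data input. A rough sketch of that packaging, using a bundled vLLM demo image as a stand-in:

from vllm.assets.image import ImageAsset

# Sketch of how a Qwen2-VL prompt and an image are typically combined for vLLM generation.
# The chat-template wrapper and the {"prompt", "multi_modal_data"} dict follow vLLM's
# vision-language examples; the exact wiring inside get_multi_modal_input may differ.
question = 'Please describe the image.'
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    f"{question}<|im_end|>\n<|im_start|>assistant\n"
)
image = ImageAsset("cherry_blossom").pil_image
vllm_input = {"prompt": prompt, "multi_modal_data": {"image": image}}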