Skip to content

Commit 3884779

Browse files
authored
Merge pull request #27 from pytorch/set-benchmark-list
Use our own list of vLLM benchmarks
2 parents 149fcbc + 86a94d8 commit 3884779

File tree

7 files changed

+486
-0
lines changed

7 files changed

+486
-0
lines changed

Diff for: vllm-benchmarks/benchmarks/README.md

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
This directory mirrors the list of benchmarks from
2+
[vLLM](https://github.com/vllm-project/vllm/tree/main/.buildkite/nightly-benchmarks/tests),
3+
but it includes only models that we want to cover in PyTorch infra.
4+
5+
Another note is that speculative decoding is not yet supported in v1
6+
with the exception of ngram, so its corresponding benchmarks are
7+
currently removed from the list.

Diff for: vllm-benchmarks/benchmarks/genai-perf-tests.json

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[
2+
{
3+
"test_name": "llama8B_tp1_genai_perf",
4+
"qps_list": [4,8,16,32],
5+
"common_parameters": {
6+
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
7+
"tp": 1,
8+
"port": 8000,
9+
"num_prompts": 500,
10+
"reuse_server": false
11+
},
12+
"vllm_server_parameters": {
13+
"disable_log_stats": "",
14+
"disable_log_requests": "",
15+
"gpu_memory_utilization": 0.9,
16+
"num_scheduler_steps": 10,
17+
"max_num_seqs": 512,
18+
"dtype": "bfloat16"
19+
},
20+
"genai_perf_input_parameters": {
21+
}
22+
}
23+
]

Diff for: vllm-benchmarks/benchmarks/latency-tests.json

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[
2+
{
3+
"test_name": "latency_llama8B_tp1",
4+
"parameters": {
5+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
6+
"tensor_parallel_size": 1,
7+
"load_format": "dummy",
8+
"num_iters_warmup": 5,
9+
"num_iters": 15
10+
}
11+
},
12+
{
13+
"test_name": "latency_llama70B_tp4",
14+
"parameters": {
15+
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
16+
"tensor_parallel_size": 4,
17+
"load_format": "dummy",
18+
"num-iters-warmup": 5,
19+
"num-iters": 15
20+
}
21+
},
22+
{
23+
"test_name": "latency_mixtral8x7B_tp2",
24+
"parameters": {
25+
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
26+
"tensor_parallel_size": 2,
27+
"load_format": "dummy",
28+
"num-iters-warmup": 5,
29+
"num-iters": 15
30+
}
31+
}
32+
]

Diff for: vllm-benchmarks/benchmarks/nightly-tests.json

+323
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
[
2+
{
3+
"test_name": "llama8B_tp1_sharegpt",
4+
"qps_list": [4,8,16,32,"inf"],
5+
"common_parameters": {
6+
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
7+
"tp": 1,
8+
"dataset_name": "sharegpt",
9+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
10+
"num_prompts": 500,
11+
"port": 8000,
12+
"reuse_server": false
13+
},
14+
"lmdeploy_server_parameters": {
15+
"dtype": "bfloat16"
16+
},
17+
"lmdeploy_client_parameters": {
18+
},
19+
"tgi_server_parameters": {
20+
},
21+
"tgi_client_parameters": {
22+
"endpoint": "/generate_stream"
23+
},
24+
"trt_server_parameters": {
25+
"model_type": "llama",
26+
"model_dtype": "bfloat16",
27+
"max_batch_size": 2048,
28+
"max_input_len": 4096,
29+
"max_seq_len": 6144,
30+
"max_num_tokens": 16384,
31+
"trt_llm_version": "v0.11.0"
32+
},
33+
"trt_client_parameters": {
34+
"endpoint": "/v2/models/ensemble/generate_stream"
35+
},
36+
"vllm_server_parameters": {
37+
"disable_log_stats": "",
38+
"disable_log_requests": "",
39+
"gpu_memory_utilization": 0.9,
40+
"num_scheduler_steps": 10,
41+
"max_num_seqs": 512,
42+
"dtype": "bfloat16"
43+
},
44+
"vllm_client_parameters": {
45+
},
46+
"sglang_server_parameters": {
47+
"disable_radix_cache": "",
48+
"enable_torch_compile": "",
49+
"dtype": "bfloat16"
50+
},
51+
"sglang_client_parameters": {
52+
}
53+
},
54+
{
55+
"test_name": "llama8B_tp1_sonnet_512_16",
56+
"qps_list": [4,8,16,32,"inf"],
57+
"common_parameters": {
58+
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
59+
"tp": 1,
60+
"dataset_name": "sonnet",
61+
"dataset_path": "./sonnet_4x.txt",
62+
"num_prompts": 500,
63+
"port": 8000,
64+
"sonnet_input_len": 512,
65+
"sonnet_output_len": 16,
66+
"sonnet_prefix_len": 50,
67+
"reuse_server": true
68+
},
69+
"lmdeploy_server_parameters": {
70+
"dtype": "bfloat16"
71+
},
72+
"lmdeploy_client_parameters": {
73+
},
74+
"tgi_server_parameters": {
75+
},
76+
"tgi_client_parameters": {
77+
"endpoint": "/generate_stream"
78+
},
79+
"trt_server_parameters": {
80+
"model_type": "llama",
81+
"model_dtype": "bfloat16",
82+
"max_batch_size": 2048,
83+
"max_input_len": 4096,
84+
"max_seq_len": 6144,
85+
"max_num_tokens": 16384,
86+
"trt_llm_version": "v0.11.0"
87+
},
88+
"trt_client_parameters": {
89+
"endpoint": "/v2/models/ensemble/generate_stream"
90+
},
91+
"vllm_server_parameters": {
92+
"disable_log_stats": "",
93+
"disable_log_requests": "",
94+
"gpu_memory_utilization": 0.9,
95+
"num_scheduler_steps": 10,
96+
"max_num_seqs": 512,
97+
"dtype": "bfloat16"
98+
},
99+
"vllm_client_parameters": {
100+
},
101+
"sglang_server_parameters": {
102+
"disable_radix_cache": "",
103+
"enable_torch_compile": "",
104+
"dtype": "bfloat16"
105+
},
106+
"sglang_client_parameters": {
107+
}
108+
},
109+
{
110+
"test_name": "llama8B_tp1_sonnet_512_256",
111+
"qps_list": [4,8,16,32,"inf"],
112+
"common_parameters": {
113+
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
114+
"tp": 1,
115+
"dataset_name": "sonnet",
116+
"dataset_path": "./sonnet_4x.txt",
117+
"num_prompts": 500,
118+
"port": 8000,
119+
"sonnet_input_len": 512,
120+
"sonnet_output_len": 256,
121+
"sonnet_prefix_len": 50,
122+
"reuse_server": true
123+
},
124+
"lmdeploy_server_parameters": {
125+
"dtype": "bfloat16"
126+
},
127+
"lmdeploy_client_parameters": {
128+
},
129+
"tgi_server_parameters": {
130+
},
131+
"tgi_client_parameters": {
132+
"endpoint": "/generate_stream"
133+
},
134+
"trt_server_parameters": {
135+
"model_type": "llama",
136+
"model_dtype": "bfloat16",
137+
"max_batch_size": 2048,
138+
"max_input_len": 4096,
139+
"max_seq_len": 6144,
140+
"max_num_tokens": 16384,
141+
"trt_llm_version": "v0.11.0"
142+
},
143+
"trt_client_parameters": {
144+
"endpoint": "/v2/models/ensemble/generate_stream"
145+
},
146+
"vllm_server_parameters": {
147+
"disable_log_stats": "",
148+
"disable_log_requests": "",
149+
"gpu_memory_utilization": 0.9,
150+
"num_scheduler_steps": 10,
151+
"max_num_seqs": 512,
152+
"dtype": "bfloat16"
153+
},
154+
"vllm_client_parameters": {
155+
},
156+
"sglang_server_parameters": {
157+
"disable_radix_cache": "",
158+
"enable_torch_compile": "",
159+
"dtype": "bfloat16"
160+
},
161+
"sglang_client_parameters": {
162+
}
163+
},
164+
{
165+
"test_name": "llama70B_tp4_sharegpt",
166+
"qps_list": [4,8,16,32,"inf"],
167+
"common_parameters": {
168+
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
169+
"tp": 4,
170+
"dataset_name": "sharegpt",
171+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
172+
"num_prompts": 500,
173+
"port": 8000,
174+
"reuse_server": false
175+
},
176+
"lmdeploy_server_parameters": {
177+
"dtype": "bfloat16"
178+
},
179+
"lmdeploy_client_parameters": {
180+
},
181+
"tgi_server_parameters": {
182+
},
183+
"tgi_client_parameters": {
184+
"endpoint": "/generate_stream"
185+
},
186+
"trt_server_parameters": {
187+
"model_type": "llama",
188+
"model_dtype": "bfloat16",
189+
"max_batch_size": 2048,
190+
"max_input_len": 4096,
191+
"max_seq_len": 6144,
192+
"max_num_tokens": 16384,
193+
"trt_llm_version": "v0.11.0"
194+
},
195+
"trt_client_parameters": {
196+
"endpoint": "/v2/models/ensemble/generate_stream"
197+
},
198+
"vllm_server_parameters": {
199+
"disable_log_stats": "",
200+
"disable_log_requests": "",
201+
"gpu_memory_utilization": 0.9,
202+
"num_scheduler_steps": 10,
203+
"max_num_seqs": 512,
204+
"dtype": "bfloat16"
205+
},
206+
"vllm_client_parameters": {
207+
},
208+
"sglang_server_parameters": {
209+
"disable_radix_cache": "",
210+
"dtype": "bfloat16"
211+
},
212+
"sglang_client_parameters": {
213+
}
214+
},
215+
{
216+
"test_name": "llama70B_tp4_sonnet_512_16",
217+
"qps_list": [4,8,16,32,"inf"],
218+
"common_parameters": {
219+
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
220+
"tp": 4,
221+
"dataset_name": "sonnet",
222+
"dataset_path": "./sonnet_4x.txt",
223+
"num_prompts": 500,
224+
"port": 8000,
225+
"sonnet_input_len": 512,
226+
"sonnet_output_len": 16,
227+
"sonnet_prefix_len": 50,
228+
"reuse_server": true
229+
},
230+
"lmdeploy_server_parameters": {
231+
"dtype": "bfloat16"
232+
},
233+
"lmdeploy_client_parameters": {
234+
},
235+
"tgi_server_parameters": {
236+
},
237+
"tgi_client_parameters": {
238+
"endpoint": "/generate_stream"
239+
},
240+
"trt_server_parameters": {
241+
"model_type": "llama",
242+
"model_dtype": "bfloat16",
243+
"max_batch_size": 2048,
244+
"max_input_len": 4096,
245+
"max_seq_len": 6144,
246+
"max_num_tokens": 16384,
247+
"trt_llm_version": "v0.11.0"
248+
},
249+
"trt_client_parameters": {
250+
"endpoint": "/v2/models/ensemble/generate_stream"
251+
},
252+
"vllm_server_parameters": {
253+
"disable_log_stats": "",
254+
"disable_log_requests": "",
255+
"gpu_memory_utilization": 0.9,
256+
"num_scheduler_steps": 10,
257+
"max_num_seqs": 512,
258+
"dtype": "bfloat16"
259+
},
260+
"vllm_client_parameters": {
261+
},
262+
"sglang_server_parameters": {
263+
"disable_radix_cache": "",
264+
"dtype": "bfloat16"
265+
},
266+
"sglang_client_parameters": {
267+
}
268+
},
269+
{
270+
"test_name": "llama70B_tp4_sonnet_512_256",
271+
"qps_list": [4,8,16,32,"inf"],
272+
"common_parameters": {
273+
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
274+
"tp": 4,
275+
"dataset_name": "sonnet",
276+
"dataset_path": "./sonnet_4x.txt",
277+
"num_prompts": 500,
278+
"port": 8000,
279+
"sonnet_input_len": 512,
280+
"sonnet_output_len": 256,
281+
"sonnet_prefix_len": 50,
282+
"reuse_server": true
283+
},
284+
"lmdeploy_server_parameters": {
285+
"dtype": "bfloat16"
286+
},
287+
"lmdeploy_client_parameters": {
288+
},
289+
"tgi_server_parameters": {
290+
},
291+
"tgi_client_parameters": {
292+
"endpoint": "/generate_stream"
293+
},
294+
"trt_server_parameters": {
295+
"model_type": "llama",
296+
"model_dtype": "bfloat16",
297+
"max_batch_size": 2048,
298+
"max_input_len": 4096,
299+
"max_seq_len": 6144,
300+
"max_num_tokens": 16384,
301+
"trt_llm_version": "v0.11.0"
302+
},
303+
"trt_client_parameters": {
304+
"endpoint": "/v2/models/ensemble/generate_stream"
305+
},
306+
"vllm_server_parameters": {
307+
"disable_log_stats": "",
308+
"disable_log_requests": "",
309+
"gpu_memory_utilization": 0.9,
310+
"num_scheduler_steps": 10,
311+
"max_num_seqs": 512,
312+
"dtype": "bfloat16"
313+
},
314+
"vllm_client_parameters": {
315+
},
316+
"sglang_server_parameters": {
317+
"disable_radix_cache": "",
318+
"dtype": "bfloat16"
319+
},
320+
"sglang_client_parameters": {
321+
}
322+
}
323+
]

0 commit comments

Comments
 (0)