[
    {
        "test_name": "llama8B_tp1_sharegpt",
        "qps_list": [4, 8, 16, 32, "inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
            "port": 8000,
            "reuse_server": false
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "enable_torch_compile": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama8B_tp1_sonnet_512_16",
        "qps_list": [4, 8, 16, 32, "inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "dataset_name": "sonnet",
            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
            "port": 8000,
            "sonnet_input_len": 512,
            "sonnet_output_len": 16,
            "sonnet_prefix_len": 50,
            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "enable_torch_compile": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama8B_tp1_sonnet_512_256",
        "qps_list": [4, 8, 16, 32, "inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "dataset_name": "sonnet",
            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
            "port": 8000,
            "sonnet_input_len": 512,
            "sonnet_output_len": 256,
            "sonnet_prefix_len": 50,
            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "enable_torch_compile": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama70B_tp4_sharegpt",
        "qps_list": [4, 8, 16, 32, "inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
            "port": 8000,
            "reuse_server": false
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama70B_tp4_sonnet_512_16",
        "qps_list": [4, 8, 16, 32, "inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sonnet",
            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
            "port": 8000,
            "sonnet_input_len": 512,
            "sonnet_output_len": 16,
            "sonnet_prefix_len": 50,
            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama70B_tp4_sonnet_512_256",
        "qps_list": [4, 8, 16, 32, "inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sonnet",
            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
            "port": 8000,
            "sonnet_input_len": 512,
            "sonnet_output_len": 256,
            "sonnet_prefix_len": 50,
            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    }
]
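
Each object in the array above describes one benchmark: a test name, the request rates to sweep ("qps_list", where "inf" means unthrottled), dataset and model settings shared by every backend ("common_parameters"), and per-backend server/client overrides (lmdeploy, TGI, TensorRT-LLM, vLLM, SGLang). Below is a minimal sketch of how such a file could be consumed, assuming it is saved locally as nightly-tests.json; the file name and the printed summary are illustrative only and not part of the benchmark suite itself.

# Illustrative sketch (assumption: the JSON array above is saved as
# nightly-tests.json). It only loads the test descriptions and prints one
# summary line per (test, request rate) pair; it does not launch any server.
import json

with open("nightly-tests.json") as f:
    tests = json.load(f)

for test in tests:
    common = test["common_parameters"]
    for qps in test["qps_list"]:
        print(
            f"{test['test_name']}: model={common['model']}, tp={common['tp']}, "
            f"dataset={common['dataset_name']}, "
            f"num_prompts={common['num_prompts']}, qps={qps}"
        )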