@@ -304,6 +304,7 @@ class Recorder
     std::vector<float> reqLatencies;
     std::vector<float> ftLatencies;
     std::vector<float> genT2TLatencies;
+    std::vector<float> userTokensPerSecond;
 
     int totalOutputTokens{0};
     int totalDecodingIter{0};
@@ -325,6 +326,10 @@ class Recorder
             {
                 genT2TLatencies.push_back(reqInfo.second.avgGenT2TLatency.value());
             }
+            if (reqInfo.second.avgGenT2TLatency.value() > 0)
+            {
+                userTokensPerSecond.push_back(1000.F / reqInfo.second.avgGenT2TLatency.value());
+            }
         }
         ++mNumSamples;
     }
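The added block converts each request's average inter-token latency (`avgGenT2TLatency`, in milliseconds) into a per-user decode speed: 1000 ms divided by the average gap between tokens gives tokens per second for that user. A minimal worked example with a hypothetical latency value:

    float avgGenT2TLatencyMs = 20.F;              // hypothetical: 20 ms between tokens
    float userTps = 1000.F / avgGenT2TLatencyMs;  // = 50 tokens/sec for this user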
@@ -377,6 +382,18 @@ class Recorder
             mMinGenT2TLatency = genT2TLatencies.front();
         }
 
+        if (!userTokensPerSecond.empty())
+        {
+            mAvgUserTokensPerSecond = std::accumulate(userTokensPerSecond.begin(), userTokensPerSecond.end(), 0.F)
+                / userTokensPerSecond.size();
+            std::sort(userTokensPerSecond.begin(), userTokensPerSecond.end());
+            mP99UserTokensPerSecond = calcPercentile(userTokensPerSecond, 99);
+            mP90UserTokensPerSecond = calcPercentile(userTokensPerSecond, 90);
+            mP50UserTokensPerSecond = calcPercentile(userTokensPerSecond, 50);
+            mMaxUserTokensPerSecond = userTokensPerSecond.back();
+            mMinUserTokensPerSecond = userTokensPerSecond.front();
+        }
+
         mAvgReqQueueingLatency
             = std::accumulate(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end(), 0.F)
             / mRequestsQueueingLatencies.size();
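`calcPercentile` is called on the sorted vector but its body is outside this diff. A sketch of what a nearest-rank percentile over a pre-sorted vector could look like (an assumption for illustration, not the repository's implementation):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Nearest-rank percentile; assumes `sorted` is ascending and non-empty,
    // as guaranteed by the std::sort and empty() check above.
    float calcPercentile(std::vector<float> const& sorted, int percentile)
    {
        auto const rank = static_cast<std::size_t>(std::ceil(percentile / 100.0 * sorted.size()));
        return sorted[rank > 0 ? rank - 1 : 0];
    }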
@@ -423,6 +440,13 @@ class Recorder
             printf("[BENCHMARK] p90_inter_token_latency(ms) %.2f\n", mP90GenT2TLatency);
             printf("[BENCHMARK] p50_inter_token_latency(ms) %.2f\n\n", mP50GenT2TLatency);
 
+            printf("[BENCHMARK] avg_user_tokens_per_second(tokens/sec/user) %.2f\n", mAvgUserTokensPerSecond);
+            printf("[BENCHMARK] max_user_tokens_per_second(tokens/sec/user) %.2f\n", mMaxUserTokensPerSecond);
+            printf("[BENCHMARK] min_user_tokens_per_second(tokens/sec/user) %.2f\n", mMinUserTokensPerSecond);
+            printf("[BENCHMARK] p99_user_tokens_per_second(tokens/sec/user) %.2f\n", mP99UserTokensPerSecond);
+            printf("[BENCHMARK] p90_user_tokens_per_second(tokens/sec/user) %.2f\n", mP90UserTokensPerSecond);
+            printf("[BENCHMARK] p50_user_tokens_per_second(tokens/sec/user) %.2f\n\n", mP50UserTokensPerSecond);
+
         printf("[BENCHMARK] avg_request_queueing_latency(ms) %.2f\n", mAvgReqQueueingLatency);
         printf("[BENCHMARK] max_request_queueing_latency(ms) %.2f\n", mMaxReqQueueingLatency);
         printf("[BENCHMARK] min_request_queueing_latency(ms) %.2f\n", mMinReqQueueingLatency);
@@ -443,11 +467,26 @@ class Recorder
 
         if (mStreaming)
         {
-            std::vector<std::string> streamingHeaders
-                = {"avg_time_to_first_token(ms)", "max_time_to_first_token(ms)", "min_time_to_first_token(ms)",
-                    "p99_time_to_first_token(ms)", "p90_time_to_first_token(ms)", "p50_time_to_first_token(ms)",
-                    "avg_inter_token_latency(ms)", "max_inter_token_latency(ms)", "min_inter_token_latency(ms)",
-                    "p99_inter_token_latency(ms)", "p90_inter_token_latency(ms)", "p50_inter_token_latency(ms)"};
+            std::vector<std::string> streamingHeaders = {
+                "avg_time_to_first_token(ms)",
+                "max_time_to_first_token(ms)",
+                "min_time_to_first_token(ms)",
+                "p99_time_to_first_token(ms)",
+                "p90_time_to_first_token(ms)",
+                "p50_time_to_first_token(ms)",
+                "avg_inter_token_latency(ms)",
+                "max_inter_token_latency(ms)",
+                "min_inter_token_latency(ms)",
+                "p99_inter_token_latency(ms)",
+                "p90_inter_token_latency(ms)",
+                "p50_inter_token_latency(ms)",
+                "avg_user_tokens_per_second(tokens/sec/user)",
+                "max_user_tokens_per_second(tokens/sec/user)",
+                "min_user_tokens_per_second(tokens/sec/user)",
+                "p99_user_tokens_per_second(tokens/sec/user)",
+                "p90_user_tokens_per_second(tokens/sec/user)",
+                "p50_user_tokens_per_second(tokens/sec/user)",
+            };
 
             headers.insert(headers.end(), streamingHeaders.begin(), streamingHeaders.end());
         }
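The six new headers are appended after the inter-token-latency columns, so the CSV row writer in the next hunk must emit the per-user throughput values in the same order: avg, max, min, p99, p90, p50.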
@@ -470,7 +509,10 @@ class Recorder
             outputFile << "," << mAvgFtLatency << "," << mMaxFtLatency << "," << mMinFtLatency << ","
                        << mP99FtLatency << "," << mP90FtLatency << "," << mP50FtLatency << ","
                        << mAvgGenT2TLatency << "," << mMaxGenT2TLatency << "," << mMinGenT2TLatency << ","
-                       << mP99GenT2TLatency << "," << mP90GenT2TLatency << "," << mP50GenT2TLatency;
+                       << mP99GenT2TLatency << "," << mP90GenT2TLatency << "," << mP50GenT2TLatency << ","
+                       << mAvgUserTokensPerSecond << "," << mMaxUserTokensPerSecond << ","
+                       << mMinUserTokensPerSecond << "," << mP99UserTokensPerSecond << ","
+                       << mP90UserTokensPerSecond << "," << mP50UserTokensPerSecond << ",";
         }
 
         outputFile << "\n";
@@ -524,6 +566,7 @@ class Recorder
     float mSeqThroughput{};
     float mAvgSeqLatency{};
     float mAvgGenT2TLatency{};
+    float mAvgUserTokensPerSecond{};
     float mAvgFtLatency{};
     float mTokenThroughput{};
     float mAcceptanceRate{};
@@ -542,6 +585,11 @@ class Recorder
     float mP50GenT2TLatency{};
     float mMaxGenT2TLatency{};
     float mMinGenT2TLatency{};
+    float mP99UserTokensPerSecond{};
+    float mP90UserTokensPerSecond{};
+    float mP50UserTokensPerSecond{};
+    float mMaxUserTokensPerSecond{};
+    float mMinUserTokensPerSecond{};
     float mAvgReqQueueingLatency{};
     float mP99ReqQueueingLatency{};
     float mP90ReqQueueingLatency{};
@@ -1054,7 +1102,7 @@ int main(int argc, char* argv[])
         "Operate in streaming mode. Note: it reflects time-to-first-token and inter-token-latency",
         cxxopts::value<bool>()->default_value("false"));
     options.add_options()(
-        "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("false"));
+        "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
         "enable_chunked_context", "Whether to enable context chunking.", cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
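This hunk flips the default for `enable_kv_cache_reuse` from `false` to `true`. Runs that should measure behavior without cache reuse now need to pass `--enable_kv_cache_reuse=false` explicitly to restore the previous default.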
@@ -1096,6 +1144,11 @@ int main(int argc, char* argv[])
         "Minimum token probability threshold for typical acceptance. Enables typical acceptance in Eagle",
         cxxopts::value<float>());
     options.add_options()("temperature", "Sampling temperature for each request", cxxopts::value<float>());
+    options.add_options()(
+        "eagle_use_dynamic_tree", "Whether to use Eagle-2", cxxopts::value<bool>()->default_value("false"));
+    options.add_options()("eagle_dynamic_tree_max_top_k",
+        "The max topK for dynamic tree, also the number of draft tokens that will expand for each node",
+        cxxopts::value<SizeType32>());
 
     options.add_options()("multi_block_mode",
         "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel",
@@ -1305,7 +1358,8 @@ int main(int argc, char* argv[])
         benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
     }
     // Argument: Eagle choices for the Eagle speculative decoding.
-    if (result.count("eagle_choices") || result.count("eagle_posterior_threshold"))
+    if (result.count("eagle_choices") || result.count("eagle_posterior_threshold")
+        || result.count("eagle_use_dynamic_tree") || result.count("eagle_dynamic_tree_max_top_k"))
     {
         std::optional<float> posteriorThreshold;
         if (result.count("eagle_posterior_threshold"))
@@ -1317,7 +1371,18 @@ int main(int argc, char* argv[])
         {
             choices = parseVectorOfVectors(result["eagle_choices"].as<std::string>());
         }
-        benchmarkParams.eagleConfig = texec::EagleConfig(choices, !posteriorThreshold.has_value(), posteriorThreshold);
+        bool eagleUseDynamicTree = false;
+        if (result.count("eagle_use_dynamic_tree"))
+        {
+            eagleUseDynamicTree = result["eagle_use_dynamic_tree"].as<bool>();
+        }
+        std::optional<SizeType32> eagleDynamicTreeMaxTopK;
+        if (result.count("eagle_dynamic_tree_max_top_k"))
+        {
+            eagleDynamicTreeMaxTopK = result["eagle_dynamic_tree_max_top_k"].as<SizeType32>();
+        }
+        benchmarkParams.eagleConfig = texec::EagleConfig(
+            choices, !posteriorThreshold.has_value(), posteriorThreshold, eagleUseDynamicTree, eagleDynamicTreeMaxTopK);
     }
     if (result.count("temperature"))
     {
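The `EagleConfig` construction now forwards two extra arguments. A sketch of the resulting call when only `--eagle_use_dynamic_tree` is passed (argument roles inferred from the call site above, not from the constructor's declaration):

    // choices and posteriorThreshold stay unset; greedy sampling is selected
    // because !posteriorThreshold.has_value() evaluates to true.
    benchmarkParams.eagleConfig = texec::EagleConfig(
        std::nullopt, true, std::nullopt,
        /*eagleUseDynamicTree=*/true, /*eagleDynamicTreeMaxTopK=*/std::nullopt);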