Performance summary accounting for the case of LLM benchmarks (#62)
* Initial commit

* Code cleanup

* Round the ttft/tpot cutoff ratios to two decimals

* Remove unnecessary variables

* Further code simplification

* Optimization and fixes
kaidrake authored Feb 3, 2025
1 parent ef3774a commit 553375a
Showing 2 changed files with 27 additions and 40 deletions.
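A note on the rounding bullet above: the two-decimal output comes from Python's {:.2f} format spec, which the new code applies uniformly to the latency, ttft and tpot cutoff ratios (strictly speaking it rounds to two decimals rather than rounding up). A tiny illustration with invented values:

    print('{}={:.2f}'.format("cutoff_ratio_tpot", 0.9137))         # cutoff_ratio_tpot=0.91
    print('{}={:.2f}%'.format("early_stopping_overhead", 1.2345))  # early_stopping_overhead=1.23%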
45 changes: 23 additions & 22 deletions base_loadgen_experiment/code_axs.py
@@ -22,7 +22,7 @@ def beautify_summary(parsed_summary):
 
     ureg = UnitRegistry()
 
-    linked_keys = ["latency", "Early_stopping_9", "duration"]
+    linked_keys = ["latency", "Early_stopping_9", "duration", "time_to_output_token"]
 
     beautified_summary = {}
     kv_with_units = {}
@@ -65,14 +65,12 @@ def beautify_summary(parsed_summary):
 
     return beautified_summary
 
-def calc_latency_cutoff_ratio(parsed_summary):
-
-    scenario = parsed_summary["Scenario"]
-
-    if scenario == "Server":
-        return parsed_summary["99.00_percentile_latency_ns"]/parsed_summary["target_latency_ns"]
+def calc_latency_cutoff_ratio(parsed_summary, latency, target_latency):
+
+    if target_latency in parsed_summary:
+        return parsed_summary[latency]/parsed_summary[target_latency]
 
 def calc_early_stopping_overhead(parsed_summary):
 
@@ -85,34 +83,37 @@ def calc_early_stopping_overhead(parsed_summary):
 
 
 #returns list of formatted performance metrics (as strings) for given experiment
-def parse_performance(beautified_summary, latency_cutoff_ratio, early_stopping_overhead, scenario_performance_map, raw=False):
+def parse_performance(parsed_summary, beautified_summary, early_stopping_overhead, scenario_performance_map, raw=False):
 
     scenario = beautified_summary["Scenario"]
     validity = beautified_summary["Result_is"]
 
     if raw and validity == "INVALID":
         return None
 
-    performance_metrics = scenario_performance_map[scenario][validity]
+    performance_metrics = scenario_performance_map[scenario]
     formatted_performance_metrics = ['{}'.format(validity)] # set first element
 
     for key_name in performance_metrics:
 
-        if raw:
-            if key_name == "latency_cutoff_ratio":
-                formatted_performance_metrics.append(latency_cutoff_ratio)
-            elif key_name == "early_stopping_overhead":
-                formatted_performance_metrics.append(early_stopping_overhead)
-            else:
-                formatted_performance_metrics.append(beautified_summary[key_name])
-
-        else: #no need for multiplier, formatting, units in scenario_performance_map - the beautify_summary function does all of this already
-            if key_name == "latency_cutoff_ratio":
-                formatted_performance_metrics.append('{}={:.2f}'.format(key_name, latency_cutoff_ratio))
-            elif key_name == "early_stopping_overhead":
-                formatted_performance_metrics.append('{}={:.2f}{}'.format(key_name, early_stopping_overhead, "%"))
-            else:
-                formatted_performance_metrics.append('{}={}'.format(key_name, beautified_summary[key_name]))
+        if key_name == "latency_cutoff_ratio":
+            fmt, value = '{}={:.2f}', calc_latency_cutoff_ratio(parsed_summary, "99.00_percentile_latency_ns", "target_latency_ns")
+        elif key_name == "cutoff_ratio_ttft":
+            fmt, value = '{}={:.2f}', calc_latency_cutoff_ratio(parsed_summary, "99.00_percentile_first_token_latency_ns", "ttft_latency_ns")
+        elif key_name == "cutoff_ratio_tpot":
+            fmt, value = '{}={:.2f}', calc_latency_cutoff_ratio(parsed_summary, "99.00_percentile_time_to_output_token_ns", "tpot_latency_ns")
+        elif key_name == "early_stopping_overhead":
+            fmt, value = '{}={:.2f}%', early_stopping_overhead
+        elif key_name in beautified_summary:
+            fmt, value = '{}={}', beautified_summary[key_name]
+        else:
+            value = None
+
+        if value is not None:
+            if raw:
+                formatted_performance_metrics.append( value )
+            else:
+                formatted_performance_metrics.append( fmt.format(key_name, value) )
 
     return formatted_performance_metrics
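For orientation, here is a standalone sketch of the generalized helper's behavior: the function is restated verbatim from the diff above, while the two summary dicts and their nanosecond values are hypothetical stand-ins for parsed LoadGen summaries.

    # Helper restated from the diff above; the summary dicts are invented.
    def calc_latency_cutoff_ratio(parsed_summary, latency, target_latency):
        if target_latency in parsed_summary:
            return parsed_summary[latency]/parsed_summary[target_latency]

    llm_server_run = {
        "99.00_percentile_first_token_latency_ns": 523_000_000,
        "ttft_latency_ns": 600_000_000,
    }
    classic_server_run = {
        "99.00_percentile_latency_ns": 8_400_000,
        "target_latency_ns": 10_000_000,
    }

    # Target key present: a ratio is returned (~0.87 here).
    print(calc_latency_cutoff_ratio(llm_server_run,
          "99.00_percentile_first_token_latency_ns", "ttft_latency_ns"))

    # Target key absent (pre-LLM summary): the function falls through and
    # returns None, which parse_performance then drops via its
    # `if value is not None` guard.
    print(calc_latency_cutoff_ratio(classic_server_run,
          "99.00_percentile_first_token_latency_ns", "ttft_latency_ns"))

This None-skipping is what lets the flattened scenario_performance_map below list the LLM-only metrics unconditionally: runs that never measured TTFT/TPOT simply omit those entries from the formatted summary.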
22 changes: 4 additions & 18 deletions base_loadgen_experiment/data_axs.json
@@ -13,27 +13,13 @@
 
     "beautified_summary": [ "^^", "beautify_summary" ],
 
-    "latency_cutoff_ratio": [ "^^", "calc_latency_cutoff_ratio" ],
-
     "early_stopping_overhead": [ "^^", "calc_early_stopping_overhead" ],
 
     "scenario_performance_map": {
-        "Offline": {
-            "VALID": ["Samples_per_second", "target_qps"],
-            "INVALID": ["Samples_per_second", "target_qps"]
-        },
-        "SingleStream": {
-            "VALID": ["90th_percentile_latency", "_Early_stopping_90th_percentile_estimate", "early_stopping_overhead"],
-            "INVALID": ["90th_percentile_latency", "_Early_stopping_90th_percentile_estimate", "early_stopping_overhead"]
-        },
-        "MultiStream": {
-            "VALID": ["99th_percentile_latency", "_Early_stopping_99th_percentile_estimate", "early_stopping_overhead"],
-            "INVALID": ["99th_percentile_latency", "_Early_stopping_99th_percentile_estimate", "early_stopping_overhead"]
-        },
-        "Server": {
-            "VALID": ["target_qps", "99.00_percentile_latency", "target_latency", "latency_cutoff_ratio", "Completed_samples_per_second"],
-            "INVALID": ["target_qps", "99.00_percentile_latency", "target_latency", "latency_cutoff_ratio", "Completed_samples_per_second"]
-        }
+        "Offline": ["Samples_per_second", "target_qps"],
+        "SingleStream": ["90th_percentile_latency", "_Early_stopping_90th_percentile_estimate", "early_stopping_overhead"],
+        "MultiStream": ["99th_percentile_latency", "_Early_stopping_99th_percentile_estimate", "early_stopping_overhead"],
+        "Server": ["target_qps", "99.00_percentile_latency", "target_latency", "latency_cutoff_ratio", "99.00_percentile_first_token_latency", "ttft_latency", "cutoff_ratio_ttft", "99.00_percentile_time_to_output_token", "tpot_latency", "cutoff_ratio_tpot", "Completed_samples_per_second"]
     },
 
     "performance": ["^^", "parse_performance"],
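For a sense of the end result, a hypothetical parse_performance return value for an LLM Server run under the extended map might look as follows; the '...' placeholders are deliberately left unfilled, since the exact rendering of those values depends on beautify_summary's unit handling, and the ratio values are invented. A pre-LLM Server run simply lacks the ttft/tpot keys, so those entries drop out and the output keeps its previous five-metric shape:

    ['VALID',
     'target_qps=...',
     '99.00_percentile_latency=...',
     'target_latency=...',
     'latency_cutoff_ratio=0.92',
     '99.00_percentile_first_token_latency=...',
     'ttft_latency=...',
     'cutoff_ratio_ttft=0.87',
     '99.00_percentile_time_to_output_token=...',
     'tpot_latency=...',
     'cutoff_ratio_tpot=0.91',
     'Completed_samples_per_second=...']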
