Performance summary accounting for the case of LLM benchmarks (#62)
* Initial commit

* Code cleanup

* Round the ttft/tpot cutoff ratios to two decimals

* Remove unnecessary variables

* Further code simplification

* Optimization and fixes
kaidrake authored Feb 3, 2025
1 parent ef3774a commit 553375a
Showing 2 changed files with 27 additions and 40 deletions.
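A note on the rounding bullet above: the two-decimal output comes from Python's {:.2f} format spec, which the new code applies uniformly to the latency, ttft and tpot cutoff ratios (strictly speaking it rounds to two decimals rather than rounding up). A tiny illustration with invented values:

    print('{}={:.2f}'.format("cutoff_ratio_tpot", 0.9137))         # cutoff_ratio_tpot=0.91
    print('{}={:.2f}%'.format("early_stopping_overhead", 1.2345))  # early_stopping_overhead=1.23%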
45 changes: 23 additions & 22 deletions base_loadgen_experiment/code_axs.py
@@ -22,7 +22,7 @@ def beautify_summary(parsed_summary):
 
     ureg = UnitRegistry()
 
-    linked_keys = ["latency", "Early_stopping_9", "duration"]
+    linked_keys = ["latency", "Early_stopping_9", "duration", "time_to_output_token"]
 
     beautified_summary = {}
     kv_with_units = {}
@@ -65,14 +65,12 @@ def beautify_summary(parsed_summary):
 
     return beautified_summary
 
-def calc_latency_cutoff_ratio(parsed_summary):
-
-    scenario = parsed_summary["Scenario"]
-
-    if scenario == "Server":
-        return parsed_summary["99.00_percentile_latency_ns"]/parsed_summary["target_latency_ns"]
+def calc_latency_cutoff_ratio(parsed_summary, latency, target_latency):
+
+    if target_latency in parsed_summary:
+        return parsed_summary[latency]/parsed_summary[target_latency]
 
 def calc_early_stopping_overhead(parsed_summary):
 
@@ -85,34 +83,37 @@ def calc_early_stopping_overhead(parsed_summary):
 
 
 #returns list of formatted performance metrics (as strings) for given experiment
-def parse_performance(beautified_summary, latency_cutoff_ratio, early_stopping_overhead, scenario_performance_map, raw=False):
+def parse_performance(parsed_summary, beautified_summary, early_stopping_overhead, scenario_performance_map, raw=False):
 
     scenario = beautified_summary["Scenario"]
     validity = beautified_summary["Result_is"]
 
     if raw and validity == "INVALID":
         return None
 
-    performance_metrics = scenario_performance_map[scenario][validity]
+    performance_metrics = scenario_performance_map[scenario]
     formatted_performance_metrics = ['{}'.format(validity)] # set first element
 
     for key_name in performance_metrics:
 
-        if raw:
-            if key_name == "latency_cutoff_ratio":
-                formatted_performance_metrics.append(latency_cutoff_ratio)
-            elif key_name == "early_stopping_overhead":
-                formatted_performance_metrics.append(early_stopping_overhead)
-            else:
-                formatted_performance_metrics.append(beautified_summary[key_name])
-
-        else: #no need for multiplier, formatting, units in scenario_performance_map - the beautify_summary function does all of this already
-            if key_name == "latency_cutoff_ratio":
-                formatted_performance_metrics.append('{}={:.2f}'.format(key_name, latency_cutoff_ratio))
-            elif key_name == "early_stopping_overhead":
-                formatted_performance_metrics.append('{}={:.2f}{}'.format(key_name, early_stopping_overhead, "%"))
-            else:
-                formatted_performance_metrics.append('{}={}'.format(key_name, beautified_summary[key_name]))
+        if key_name == "latency_cutoff_ratio":
+            fmt, value = '{}={:.2f}', calc_latency_cutoff_ratio(parsed_summary, "99.00_percentile_latency_ns", "target_latency_ns")
+        elif key_name == "cutoff_ratio_ttft":
+            fmt, value = '{}={:.2f}', calc_latency_cutoff_ratio(parsed_summary, "99.00_percentile_first_token_latency_ns", "ttft_latency_ns")
+        elif key_name == "cutoff_ratio_tpot":
+            fmt, value = '{}={:.2f}', calc_latency_cutoff_ratio(parsed_summary, "99.00_percentile_time_to_output_token_ns", "tpot_latency_ns")
+        elif key_name == "early_stopping_overhead":
+            fmt, value = '{}={:.2f}%', early_stopping_overhead
+        elif key_name in beautified_summary:
+            fmt, value = '{}={}', beautified_summary[key_name]
+        else:
+            value = None
+
+        if value is not None:
+            if raw:
+                formatted_performance_metrics.append( value )
+            else:
+                formatted_performance_metrics.append( fmt.format(key_name, value) )
 
     return formatted_performance_metrics
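For orientation, here is a standalone sketch of the generalized helper's behavior: the function is restated verbatim from the diff above, while the two summary dicts and their nanosecond values are hypothetical stand-ins for parsed LoadGen summaries.

    # Helper restated from the diff above; the summary dicts are invented.
    def calc_latency_cutoff_ratio(parsed_summary, latency, target_latency):
        if target_latency in parsed_summary:
            return parsed_summary[latency]/parsed_summary[target_latency]

    llm_server_run = {
        "99.00_percentile_first_token_latency_ns": 523_000_000,
        "ttft_latency_ns": 600_000_000,
    }
    classic_server_run = {
        "99.00_percentile_latency_ns": 8_400_000,
        "target_latency_ns": 10_000_000,
    }

    # Target key present: a ratio is returned (~0.87 here).
    print(calc_latency_cutoff_ratio(llm_server_run,
          "99.00_percentile_first_token_latency_ns", "ttft_latency_ns"))

    # Target key absent (pre-LLM summary): the function falls through and
    # returns None, which parse_performance then drops via its
    # `if value is not None` guard.
    print(calc_latency_cutoff_ratio(classic_server_run,
          "99.00_percentile_first_token_latency_ns", "ttft_latency_ns"))

This None-skipping is what lets the flattened scenario_performance_map below list the LLM-only metrics unconditionally: runs that never measured TTFT/TPOT simply omit those entries from the formatted summary.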
22 changes: 4 additions & 18 deletions base_loadgen_experiment/data_axs.json
@@ -13,27 +13,13 @@
 
     "beautified_summary": [ "^^", "beautify_summary" ],
 
-    "latency_cutoff_ratio": [ "^^", "calc_latency_cutoff_ratio" ],
-
     "early_stopping_overhead": [ "^^", "calc_early_stopping_overhead" ],
 
     "scenario_performance_map": {
-        "Offline": {
-            "VALID": ["Samples_per_second", "target_qps"],
-            "INVALID": ["Samples_per_second", "target_qps"]
-        },
-        "SingleStream": {
-            "VALID": ["90th_percentile_latency", "_Early_stopping_90th_percentile_estimate", "early_stopping_overhead"],
-            "INVALID": ["90th_percentile_latency", "_Early_stopping_90th_percentile_estimate", "early_stopping_overhead"]
-        },
-        "MultiStream": {
-            "VALID": ["99th_percentile_latency", "_Early_stopping_99th_percentile_estimate", "early_stopping_overhead"],
-            "INVALID": ["99th_percentile_latency", "_Early_stopping_99th_percentile_estimate", "early_stopping_overhead"]
-        },
-        "Server": {
-            "VALID": ["target_qps", "99.00_percentile_latency", "target_latency", "latency_cutoff_ratio", "Completed_samples_per_second"],
-            "INVALID": ["target_qps", "99.00_percentile_latency", "target_latency", "latency_cutoff_ratio", "Completed_samples_per_second"]
-        }
+        "Offline": ["Samples_per_second", "target_qps"],
+        "SingleStream": ["90th_percentile_latency", "_Early_stopping_90th_percentile_estimate", "early_stopping_overhead"],
+        "MultiStream": ["99th_percentile_latency", "_Early_stopping_99th_percentile_estimate", "early_stopping_overhead"],
+        "Server": ["target_qps", "99.00_percentile_latency", "target_latency", "latency_cutoff_ratio", "99.00_percentile_first_token_latency", "ttft_latency", "cutoff_ratio_ttft", "99.00_percentile_time_to_output_token", "tpot_latency", "cutoff_ratio_tpot", "Completed_samples_per_second"]
     },
 
     "performance": ["^^", "parse_performance"],
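For a sense of the end result, a hypothetical parse_performance return value for an LLM Server run under the extended map might look as follows; the '...' placeholders are deliberately left unfilled, since the exact rendering of those values depends on beautify_summary's unit handling, and the ratio values are invented. A pre-LLM Server run simply lacks the ttft/tpot keys, so those entries drop out and the output keeps its previous five-metric shape:

    ['VALID',
     'target_qps=...',
     '99.00_percentile_latency=...',
     'target_latency=...',
     'latency_cutoff_ratio=0.92',
     '99.00_percentile_first_token_latency=...',
     'ttft_latency=...',
     'cutoff_ratio_ttft=0.87',
     '99.00_percentile_time_to_output_token=...',
     'tpot_latency=...',
     'cutoff_ratio_tpot=0.91',
     'Completed_samples_per_second=...']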
