Commit 0394a75

Add prefill tps in oga-bench

Signed-off-by: David Fan <[email protected]>
Parent: 8c46f6b

4 files changed (+12, -8 lines)


src/turnkeyml/llm/cache.py (+2, -1)

@@ -21,9 +21,10 @@ class Keys:
     PER_ITERATION_LATENCY = "per_iteration_latency"
     MEAN_LATENCY = "mean_latency"
     STD_DEV_LATENCY = "std_dev_latency"
-    MEAN_TOKENS_PER_SECOND = "mean_tokens_per_second"
+    TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
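For context, a rough sketch of how a consumer might read the renamed and newly added keys back out of a build's stats. The cache directory and build name are hypothetical, and the fs.Stats access pattern is borrowed from test/llm_api.py further down; the exact import path for the fs module is an assumption.

from turnkeyml.llm.cache import Keys
import turnkeyml.common.filesystem as fs  # assumed import path for the fs module used in test/llm_api.py

# Hypothetical cache directory and build name
stats = fs.Stats("~/.cache/turnkey", "my_llm_build").stats

print(stats[Keys.PREFILL_TOKENS_PER_SECOND])           # prompt-processing (prefill) throughput
print(stats[Keys.TOKEN_GENERATION_TOKENS_PER_SECOND])  # decode-phase throughput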

src/turnkeyml/llm/tools/huggingface_bench.py (+3, -3)

@@ -110,7 +110,7 @@ class HuggingfaceBench(Tool):
     def __init__(self):
         super().__init__(monitor_message="Benchmarking Huggingface LLM")

-        self.status_stats = [Keys.SECONDS_TO_FIRST_TOKEN, Keys.MEAN_TOKENS_PER_SECOND]
+        self.status_stats = [Keys.SECONDS_TO_FIRST_TOKEN, Keys.TOKEN_GENERATION_TOKENS_PER_SECOND]

     @staticmethod
     def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):

@@ -283,11 +283,11 @@ def run(
             [token_len for _, token_len in decode_per_iteration_result]
         )
         # Subtract 1 so that we don't count the prefill token
-        mean_tokens_per_second = (mean_token_len - 1) / mean_decode_latency
+        token_generation_tokens_per_second = (mean_token_len - 1) / mean_decode_latency

         # Save performance data to stats
         state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token)
-        state.save_stat(Keys.MEAN_TOKENS_PER_SECOND, mean_tokens_per_second)
+        state.save_stat(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second)
         state.save_stat(Keys.PROMPT_TOKENS, input_ids.shape[1])

         return state
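A minimal standalone sketch of the decode-phase throughput computed in the hunk above, with illustrative numbers; decode_per_iteration_result is assumed to hold (latency, token count) pairs, as the surrounding code suggests.

import statistics

# Illustrative values only: (decode latency in seconds, tokens produced) per iteration
decode_per_iteration_result = [(0.50, 26), (0.52, 26), (0.48, 26)]

mean_decode_latency = statistics.mean(
    [latency for latency, _ in decode_per_iteration_result]
)
mean_token_len = statistics.mean(
    [token_len for _, token_len in decode_per_iteration_result]
)

# Subtract 1 so that the prefill token is not counted as a generated token
token_generation_tokens_per_second = (mean_token_len - 1) / mean_decode_latency
print(token_generation_tokens_per_second)  # 25 tokens / 0.5 s = 50 tok/s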

src/turnkeyml/llm/tools/ort_genai/oga_bench.py (+6, -3)

@@ -32,7 +32,8 @@ def __init__(self):

         self.status_stats = [
             Keys.SECONDS_TO_FIRST_TOKEN,
-            Keys.MEAN_TOKENS_PER_SECOND,
+            Keys.PREFILL_TOKENS_PER_SECOND,
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             Keys.PROMPT_TOKENS,
         ]

@@ -144,10 +145,12 @@ def run(
             per_iteration_tokens_per_second.append(model.tokens_per_second)

         mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
-        mean_tokens_per_second = statistics.mean(per_iteration_tokens_per_second)
+        prefill_tokens_per_second = input_ids_len / mean_time_to_first_token
+        token_generation_tokens_per_second = statistics.mean(per_iteration_tokens_per_second)

         state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token)
-        state.save_stat(Keys.MEAN_TOKENS_PER_SECOND, mean_tokens_per_second)
+        state.save_stat(Keys.PREFILL_TOKENS_PER_SECOND, prefill_tokens_per_second)
+        state.save_stat(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second)
         state.save_stat(Keys.PROMPT_TOKENS, input_ids_len)

         return state
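A minimal sketch of the new prefill metric added in this hunk: prompt tokens divided by mean time to first token. The prompt length and latencies below are illustrative values, not measurements from oga-bench.

import statistics

# Illustrative values: a 64-token prompt and three measured times to first token (seconds)
input_ids_len = 64
per_iteration_time_to_first_token = [0.16, 0.15, 0.17]

mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)

# Prefill throughput: prompt tokens processed per second before the first token is emitted
prefill_tokens_per_second = input_ids_len / mean_time_to_first_token
print(prefill_tokens_per_second)  # 64 / 0.16 = 400 tok/s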

test/llm_api.py (+1, -1)

@@ -78,7 +78,7 @@ def test_001_huggingface_bench(self):

         stats = fs.Stats(state.cache_dir, state.build_name).stats

-        assert stats[Keys.MEAN_TOKENS_PER_SECOND] > 0
+        assert stats[Keys.TOKEN_GENERATION_TOKENS_PER_SECOND] > 0


 if __name__ == "__main__":
