Add --bs test option and compute baseline mean perplexity over the first bs prompts

nod-ai · Nov 22, 2024 · 33d25e1
1 parent 1b1cec7
commit 33d25e1
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 43 deletions.
diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml
@@ -75,7 +75,7 @@ jobs:
             iree-base-runtime
 
       - name: Run perplexity test with vmfb
-        run:  pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --run-nightly-llama-tests --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+        run:  pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --run-nightly-llama-tests --bs=100 --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
 
   test_perplexity_torch:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}

diff --git a/.github/workflows/ci_eval_short.yaml b/.github/workflows/ci_eval_short.yaml
@@ -74,4 +74,4 @@ jobs:
             iree-base-runtime
 
       - name: Run perplexity test with vmfb
-        run:  pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --run-quick-llama-test --num-prompts=5 --iree-device='hip://6' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+        run:  pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --run-quick-llama-test --bs=5 --iree-device='hip://6' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py
@@ -7,6 +7,7 @@
 import unittest
 import pytest
 import json
+import numpy as np
 
 from sharktank.evaluate import perplexity_vmfb
 from sharktank.utils.export_artifacts import (
@@ -52,18 +53,20 @@ def test_llama3_8B_f16_decomposed(self):
                 f"--iree-hip-target={self.iree_hip_target}",
                 f"--tensor-parallelism-size=1",
                 f"--attention-kernel=decomposed",
-                f"--num-prompts=5",
+                f"--num-prompts={self.bs}",
             ]
         )
 
-        perplexity_difference = (
-            current_perplexity["mean_perplexity"]
-            - baseline_perplexity["mean_perplexity"]
+        baseline_mean_perplexity = round(
+            np.mean(baseline_perplexity["perplexities"][0 : self.bs]), 6
         )
+        current_mean_perplexity = round(current_perplexity["mean_perplexity"], 6)
+
+        perplexity_difference = current_mean_perplexity - baseline_mean_perplexity
 
         self.assertAlmostEqual(
-            baseline_perplexity["mean_perplexity"],
-            current_perplexity["mean_perplexity"],
+            baseline_mean_perplexity,
+            current_mean_perplexity,
             delta=self.delta,
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
@@ -86,17 +89,20 @@ def test_llama3_8B_f16(self):
                 f"--iree-hip-target={self.iree_hip_target}",
                 f"--tensor-parallelism-size=1",
                 f"--attention-kernel=torch_sdpa",
+                f"--num-prompts={self.bs}",
             ]
         )
 
-        perplexity_difference = (
-            current_perplexity["mean_perplexity"]
-            - baseline_perplexity["mean_perplexity"]
+        baseline_mean_perplexity = round(
+            np.mean(baseline_perplexity["perplexities"][0 : self.bs]), 6
         )
+        current_mean_perplexity = round(current_perplexity["mean_perplexity"], 6)
+
+        perplexity_difference = current_mean_perplexity - baseline_mean_perplexity
 
         self.assertAlmostEqual(
-            baseline_perplexity["mean_perplexity"],
-            current_perplexity["mean_perplexity"],
+            baseline_mean_perplexity,
+            current_mean_perplexity,
             delta=self.delta,
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
@@ -119,17 +125,20 @@ def test_llama3_8B_fp8_decomposed(self):
                 f"--iree-hip-target={self.iree_hip_target}",
                 f"--tensor-parallelism-size=1",
                 f"--attention-kernel=decomposed",
+                f"--num-prompts={self.bs}",
             ]
         )
 
-        perplexity_difference = (
-            current_perplexity["mean_perplexity"]
-            - baseline_perplexity["mean_perplexity"]
+        baseline_mean_perplexity = round(
+            np.mean(baseline_perplexity["perplexities"][0 : self.bs]), 6
         )
+        current_mean_perplexity = round(current_perplexity["mean_perplexity"], 6)
+
+        perplexity_difference = current_mean_perplexity - baseline_mean_perplexity
 
         self.assertAlmostEqual(
-            baseline_perplexity["mean_perplexity"],
-            current_perplexity["mean_perplexity"],
+            baseline_mean_perplexity,
+            current_mean_perplexity,
             delta=self.delta,
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
@@ -152,17 +161,20 @@ def test_llama3_8B_fp8(self):
                 f"--iree-hip-target={self.iree_hip_target}",
                 f"--tensor-parallelism-size=1",
                 f"--attention-kernel=torch_sdpa",
+                f"--num-prompts={self.bs}",
             ]
         )
 
-        perplexity_difference = (
-            current_perplexity["mean_perplexity"]
-            - baseline_perplexity["mean_perplexity"]
+        baseline_mean_perplexity = round(
+            np.mean(baseline_perplexity["perplexities"][0 : self.bs]), 6
         )
+        current_mean_perplexity = round(current_perplexity["mean_perplexity"], 6)
+
+        perplexity_difference = current_mean_perplexity - baseline_mean_perplexity
 
         self.assertAlmostEqual(
-            baseline_perplexity["mean_perplexity"],
-            current_perplexity["mean_perplexity"],
+            baseline_mean_perplexity,
+            current_mean_perplexity,
             delta=self.delta,
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
@@ -187,17 +199,20 @@ def test_llama3_405B_f16_decomposed(self):
                 f"--iree-hip-target={self.iree_hip_target}",
                 f"--tensor-parallelism-size={self.tensor_parallelism_size}",
                 f"--attention-kernel=decomposed",
+                f"--num-prompts={self.bs}",
             ]
         )
 
-        perplexity_difference = (
-            current_perplexity["mean_perplexity"]
-            - baseline_perplexity["mean_perplexity"]
+        baseline_mean_perplexity = round(
+            np.mean(baseline_perplexity["perplexities"][0 : self.bs]), 6
         )
+        current_mean_perplexity = round(current_perplexity["mean_perplexity"], 6)
+
+        perplexity_difference = current_mean_perplexity - baseline_mean_perplexity
 
         self.assertAlmostEqual(
-            baseline_perplexity["mean_perplexity"],
-            current_perplexity["mean_perplexity"],
+            baseline_mean_perplexity,
+            current_mean_perplexity,
             delta=self.delta,
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
@@ -220,17 +235,20 @@ def test_llama3_405B_f16(self):
                 f"--iree-hip-target={self.iree_hip_target}",
                 f"--tensor-parallelism-size={self.tensor_parallelism_size}",
                 f"--attention-kernel=torch_sdpa",
+                f"--num-prompts={self.bs}",
             ]
         )
 
-        perplexity_difference = (
-            current_perplexity["mean_perplexity"]
-            - baseline_perplexity["mean_perplexity"]
+        baseline_mean_perplexity = round(
+            np.mean(baseline_perplexity["perplexities"][0 : self.bs]), 6
         )
+        current_mean_perplexity = round(current_perplexity["mean_perplexity"], 6)
+
+        perplexity_difference = current_mean_perplexity - baseline_mean_perplexity
 
         self.assertAlmostEqual(
-            baseline_perplexity["mean_perplexity"],
-            current_perplexity["mean_perplexity"],
+            baseline_mean_perplexity,
+            current_mean_perplexity,
             delta=self.delta,
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
@@ -253,17 +271,20 @@ def test_llama3_405B_fp8_decomposed(self):
                 f"--iree-hip-target={self.iree_hip_target}",
                 f"--tensor-parallelism-size={self.tensor_parallelism_size}",
                 f"--attention-kernel=decomposed",
+                f"--num-prompts={self.bs}",
             ]
         )
 
-        perplexity_difference = (
-            current_perplexity["mean_perplexity"]
-            - baseline_perplexity["mean_perplexity"]
+        baseline_mean_perplexity = round(
+            np.mean(baseline_perplexity["perplexities"][0 : self.bs]), 6
         )
+        current_mean_perplexity = round(current_perplexity["mean_perplexity"], 6)
+
+        perplexity_difference = current_mean_perplexity - baseline_mean_perplexity
 
         self.assertAlmostEqual(
-            baseline_perplexity["mean_perplexity"],
-            current_perplexity["mean_perplexity"],
+            baseline_mean_perplexity,
+            current_mean_perplexity,
             delta=self.delta,
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
@@ -286,17 +307,20 @@ def test_llama3_405B_fp8(self):
                 f"--iree-hip-target={self.iree_hip_target}",
                 f"--tensor-parallelism-size={self.tensor_parallelism_size}",
                 f"--attention-kernel=torch_sdpa",
+                f"--num-prompts={self.bs}",
             ]
         )
 
-        perplexity_difference = (
-            current_perplexity["mean_perplexity"]
-            - baseline_perplexity["mean_perplexity"]
+        baseline_mean_perplexity = round(
+            np.mean(baseline_perplexity["perplexities"][0 : self.bs]), 6
         )
+        current_mean_perplexity = round(current_perplexity["mean_perplexity"], 6)
+
+        perplexity_difference = current_mean_perplexity - baseline_mean_perplexity
 
         self.assertAlmostEqual(
-            baseline_perplexity["mean_perplexity"],
-            current_perplexity["mean_perplexity"],
+            baseline_mean_perplexity,
+            current_mean_perplexity,
             delta=self.delta,
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )