 import numpy as np
 import sys
 import csv
+import random
 
-
+import warnings
+warnings.filterwarnings("ignore")
 
 class CompositeMHA(torch.nn.Module):
     def __init__(self, num_heads, in_proj_weight, in_proj_bias, out_proj):
         super().__init__()
@@ -25,14 +27,15 @@ def forward(self, query, key, value, mask):
             query, self.in_proj_weight, self.in_proj_bias
         )
 
-        batch_size, seq_len, embed_dim = query_projected.size()
+        batch_size = query_projected.size(0)
+        embed_dim = query_projected.size(2)
         head_dim = embed_dim // (self.num_heads * 3)
 
-        # Transpose seq_len and num_heads dim
-        query_projected = query_projected.view(
-            batch_size, seq_len, 3 * self.num_heads, head_dim
-        ).transpose(1, 2)
-        query, key, value = query_projected.chunk(3, 1)
+        query, key, value = query_projected.chunk(3, -1)
+
+        query = query.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
 
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         attn, _ = torch.nn.functional._scaled_dot_product_attention(
@@ -46,7 +49,7 @@ def forward(self, query, key, value, mask):
         )
 
         attn = attn.transpose(1, 2).reshape(
-            batch_size, seq_len, self.num_heads * head_dim
+            batch_size, -1, self.num_heads * head_dim
         )
         # Match return signature of nn.MHA
         return self.out_proj(attn), None
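
The reshape reordering above is what makes forward nested-tensor friendly: nested tensors support only a limited set of view and transpose patterns, so the old packed view into (batch, seq_len, 3 * num_heads, head_dim) followed by a chunk along the head dimension is replaced by chunking q/k/v off the last dimension first and then splitting heads per tensor, with -1 standing in for the ragged seq_len. For dense inputs the two orderings are interchangeable; a minimal standalone sketch with hypothetical sizes that checks the equivalence:

import torch

B, S, H, hd = 2, 5, 4, 8  # hypothetical batch, seq_len, num_heads, head_dim
qkv = torch.randn(B, S, 3 * H * hd)  # packed projection output: [q | k | v] on the last dim

# Old ordering: view into 3*num_heads heads, then chunk along the head dim.
q_old, k_old, v_old = qkv.view(B, S, 3 * H, hd).transpose(1, 2).chunk(3, 1)

# New ordering: chunk q/k/v off the last dim, then split heads per tensor.
q_new, k_new, v_new = [
    t.view(B, -1, H, hd).transpose(1, 2) for t in qkv.chunk(3, -1)
]

# Both yield (B, H, S, hd) and agree element-wise on dense tensors.
assert torch.equal(q_old, q_new) and torch.equal(k_old, k_new) and torch.equal(v_old, v_new)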
@@ -60,6 +63,18 @@ def build_composite_mha_from_nn_mha(pt):
     return CompositeMHA(pt.num_heads, pt.in_proj_weight, pt.in_proj_bias, pt.out_proj)
 
 
+def generate_rand_batch(batch_size, max_sequence_len, embed_dimension, pad_percentage=None, dtype=torch.float16, device="cuda"):
+    if not pad_percentage:
+        return torch.randn(batch_size, max_sequence_len, embed_dimension, dtype=dtype, device=device), None
+    # Really slow, but it works: draw each sequence length from a Gaussian around the target padding ratio.
+    seq_len_list = [int(max_sequence_len * (1 - random.gauss(pad_percentage, 0.01))) for _ in range(batch_size)]
+    # Force one random element to max length so the batch spans the full sequence dimension.
+    seq_len_list[random.randint(0, batch_size - 1)] = max_sequence_len
+    # print(f"Theoretical padding: {pad_percentage} actual: {1 - (sum(seq_len_list) / (batch_size * max_sequence_len))}")
+    return torch.nested.nested_tensor([
+        torch.randn(seq_len, embed_dimension, dtype=dtype, device=device) for seq_len in seq_len_list]), seq_len_list
+
+
 def benchmark_torch_function(iters, f, *args, **kwargs):
     if f is None:
         return None
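
With pad_percentage set, generate_rand_batch returns a torch.nested.nested_tensor whose entries keep their own sequence lengths, so the attention kernels never compute over padding tokens. A small usage sketch (hypothetical sizes; assumes a CUDA device, since the default device is "cuda"):

x, lengths = generate_rand_batch(4, 64, 256, pad_percentage=0.5)
print(x.is_nested)          # True: one entry per sequence, no padding stored
print(lengths)              # e.g. [33, 31, 64, 32]; one entry is forced to max_sequence_len
print(x.unbind()[0].shape)  # torch.Size([33, 256]) for the first sequence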
@@ -75,50 +90,57 @@ def benchmark_torch_function(iters, f, *args, **kwargs):
     return (start_event.elapsed_time(end_event) * 1.0e-3) / iters
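
The hunk elides the body of benchmark_torch_function, but the return line shows the measurement is taken with CUDA events and reported as seconds per iteration. A generic sketch of that recipe, not necessarily the file's exact body:

def benchmark_torch_function_sketch(iters, f, *args, **kwargs):
    if f is None:
        return None
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    f(*args, **kwargs)  # warm-up call so one-time setup is not timed
    start_event.record()
    for _ in range(iters):
        f(*args, **kwargs)
    end_event.record()
    torch.cuda.synchronize()  # elapsed_time is only valid once both events have completed
    return (start_event.elapsed_time(end_event) * 1.0e-3) / iters  # ms -> s, per iteration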
-def run_timing(batch_size, D, H, L, writer):
-    dropout_p = 0.0
-    mask = None
-
-    pt = torch.nn.MultiheadAttention(
-        embed_dim=D, num_heads=H, batch_first=True, dropout=dropout_p
-    )
-    npt = pt.eval().half().cuda()
-    cpt = build_composite_mha_from_nn_mha(npt)
-
-    x = torch.randn(batch_size, L, D)
-    x = x.half().cuda()
-
-    pt_output, _ = pt(x, x, x, mask)
-    cp_output, _ = cpt(x, x, x, mask)
+def run_timing(iters, batch_size, embed_dimension, num_heads, max_sequence_len, pad_percentage, writer):
+    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True):
+        with torch.inference_mode():
+            dropout_p = 0.0
+            mask = None
 
-    # First order sanity check. Not a replacement for rigorous tests.
-    assert torch.allclose(pt_output, cp_output, atol=1e-3, rtol=1e-3)
+            pt = torch.nn.MultiheadAttention(
+                embed_dim=embed_dimension, num_heads=num_heads, batch_first=True, dropout=dropout_p
+            )
+            npt = pt.eval().half().cuda()
+            cpt = build_composite_mha_from_nn_mha(npt)
+            x, lengths = generate_rand_batch(batch_size, max_sequence_len, embed_dimension, pad_percentage)
+            pt_output, _ = pt(x, x, x, mask)
+            cpt_output, _ = cpt(x, x, x, mask)
+
+            # First order sanity check. Not a replacement for rigorous tests.
+            if pt_output.is_nested and cpt_output.is_nested:
+                for a, b in zip(pt_output.unbind(), cpt_output.unbind()):
+                    assert torch.allclose(a, b, atol=1e-3, rtol=1e-3)
+            else:
+                assert torch.allclose(pt_output, cpt_output, atol=1e-3, rtol=1e-3)
 
-    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_flash=True):
-        with torch.inference_mode():
             pt_time = benchmark_torch_function(iters, npt, x, x, x, mask) * 1e3
             cp_time = benchmark_torch_function(iters, cpt, x, x, x, mask) * 1e3
             results = {}
-            results["L"] = L
-            results["H"] = H
-            results["D"] = D
+            results["max_sequence_len"] = max_sequence_len
+            results["num_heads"] = num_heads
+            results["embed_dimension"] = embed_dimension
             results["pt_time"] = pt_time
             results["cp_time"] = cp_time
             results["speedup"] = pt_time / cp_time
             results["dtype"] = str(x.dtype)
             writer.writerow(results)
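
run_timing now does everything under torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True): with the math fallback disabled, the dispatcher must pick the flash kernel, and inputs flash cannot handle raise instead of silently running the slow path. A hedged sketch of that behavior (hypothetical shapes; flash attention at the time of this change required half-precision inputs):

q = k = v = torch.randn(2, 8, 64, 32, device="cuda")  # float32 on purpose
with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True):
    try:
        torch.nn.functional._scaled_dot_product_attention(q, k, v)
    except RuntimeError as err:
        print("no eligible kernel:", err)  # float32 rules out flash; math is disabled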
 
 
-if __name__ == "__main__":
+def main():
     iters = 100
     seed = 123
     np.random.seed(seed)
     torch.manual_seed(seed)
 
-    headers = ["L", "H", "D", "pt_time", "cp_time", "speedup", "dtype"]
+    headers = ["max_sequence_len", "num_heads", "embed_dimension", "pt_time", "cp_time", "speedup", "dtype"]
     writer = csv.DictWriter(sys.stdout, headers)
     writer.writeheader()
 
     batch_size = 64
-    for H, L in itertools.product([1, 2, 4, 8, 16, 32], [64, 128, 256]):
-        run_timing(batch_size, 1024, H, L, writer)
+    pad_percentage = 0.5
+
+    for num_heads, max_seq_len in itertools.product([2, 4, 8, 16, 32], [64, 128, 256]):
+        run_timing(iters, batch_size, 1024, num_heads, max_seq_len, pad_percentage, writer)
+
+
+if __name__ == "__main__":
+    main()
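
Because the CSV writer targets sys.stdout, a run can be redirected straight to a file, e.g. python sdp.py > results.csv (the script name here is assumed); the first row is the header max_sequence_len,num_heads,embed_dimension,pt_time,cp_time,speedup,dtype, and pt_time/cp_time are reported in milliseconds.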