Commit e52b086

replaces some logic with index ops (#176)
fixes bs1/direct cache exports.
1 parent: 4957412

File tree

1 file changed: +13 -7


sharktank/sharktank/layers/paged_llama_attention_block.py

@@ -181,15 +181,21 @@ def transact_cache_direct(
             return xk_cache_update, xv_cache_update
         else:
             # Decode. Write a single timestep.
-            # TODO: This needs to be reworked with index ops.
             assert xk_cache_update.shape[1] == 1
             assert xv_cache_update.shape[1] == 1
-            max_start_pos = 0
-            for row_index in range(bs):
-                row_start_pos = start_positions[row_index].item()
-                max_start_pos = max(row_start_pos, max_start_pos)
-                cache_k[row_index, row_start_pos] = xk_cache_update[row_index, 0]
-                cache_v[row_index, row_start_pos] = xv_cache_update[row_index, 0]
+            for b in range(bs):
+                # Make a tensor because indices must be all tensors, so we can avoid
+                # doing start_positions[row_index].item(), which generates a lot of SymInts.
+                row_index = torch.tensor(
+                    b, dtype=torch.int64, device=xk_cache_update.device
+                )
+                row_start_pos = start_positions[row_index]
+                cache_k.index_put(
+                    (row_index, row_start_pos), xk_cache_update[row_index, 0]
+                )
+                cache_v.index_put(
+                    (row_index, row_start_pos), xv_cache_update[row_index, 0]
+                )
             return cache_k[:, :kv_seq_len], cache_v[:, :kv_seq_len]

     def transact_cache_paged(
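
As a rough illustration of the pattern this commit adopts, here is a self-contained sketch; it is not code from the repo, and `cache`, `update`, and `start_positions` are made-up stand-ins for the kv-cache tensors. It indexes with 0-d tensors instead of Python ints from `.item()`, which (per the new comment in the diff) avoids generating SymInts during export. Note the sketch uses the in-place `index_put_`, whereas the diff calls the out-of-place `index_put`.

import torch

# Hypothetical stand-ins for the decode-step kv-cache update:
# cache is [bs, seq_len, n_heads, head_dim]; update holds one timestep per row.
bs, seq_len, n_heads, head_dim = 2, 16, 4, 8
cache = torch.zeros(bs, seq_len, n_heads, head_dim)
update = torch.randn(bs, 1, n_heads, head_dim)
start_positions = torch.tensor([3, 7], dtype=torch.int64)

for b in range(bs):
    # Index with 0-d tensors rather than start_positions[b].item();
    # the Python-int path materializes SymInts under torch.export.
    row_index = torch.tensor(b, dtype=torch.int64)
    row_start_pos = start_positions[row_index]
    # index_put_ writes in place; the out-of-place index_put returns
    # a new tensor and would leave `cache` unchanged if discarded.
    cache.index_put_((row_index, row_start_pos), update[row_index, 0])

assert torch.equal(cache[0, 3], update[0, 0])
assert torch.equal(cache[1, 7], update[1, 0])

Keeping the per-row loop makes the change minimal relative to the old code; a fully vectorized scatter over all rows at once would also work, but that is beyond what this commit does.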
