 import torch
 import torch._dynamo.config
 import torch._inductor.config
-from torch.nn.attention.flex_attention import create_block_mask
+from torch.nn.attention.flex_attention import BlockMask, create_block_mask

 def device_sync(device):
     if "cuda" in device:
@@ -58,26 +58,29 @@ def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
 def roundup(val, multiplier):
     return ((val - 1) // multiplier + 1) * multiplier

+def causal_mask(b, h, q, kv):
+    return q >= kv
+
 def prefill(model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs) -> torch.Tensor:
     # input_pos: [B, S]
-    mask = create_block_mask(model.get_mask_mod(0), 1, 1, input_pos.shape[0], model.max_seq_length, device="cuda")
+    mask = create_block_mask(causal_mask, 1, 1, input_pos.shape[0], model.max_seq_length, device="cuda")
     logits = model(mask, x, input_pos)
     return sample(logits, **sampling_kwargs)[0]

-def decode_one_token(model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
+def decode_one_token(model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, block_mask: BlockMask, **sampling_kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
     # input_pos: [B, 1]
     assert input_pos.shape[-1] == 1
-    block_index = input_pos // model.block_mask.BLOCK_SIZE[0]
-    mask = model.block_mask[:, :, block_index]
-    mask.mask_mod = model.get_mask_mod(input_pos[0])
+    block_index = input_pos // block_mask.BLOCK_SIZE[0]
+    mask = block_mask[:, :, block_index]
     logits = model(mask, x, input_pos)
     return sample(logits, **sampling_kwargs)

 def decode_n_tokens(model: Transformer, cur_token: torch.Tensor, input_pos: torch.Tensor, num_new_tokens: int, callback=lambda _: _, **sampling_kwargs):
+    block_mask = create_block_mask(causal_mask, 1, 1, model.max_seq_length, model.max_seq_length, device="cuda")
     new_tokens, new_probs = [], []
     for i in range(num_new_tokens):
         next_token, next_prob = decode_one_token(
-            model, cur_token, input_pos, **sampling_kwargs
+            model, cur_token, input_pos, block_mask, **sampling_kwargs
        )
         input_pos += 1
         new_tokens.append(next_token.clone())
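For context, a minimal standalone sketch (not part of the diff) of how the pieces changed here fit together: causal_mask is a flex_attention mask_mod predicate, create_block_mask precomputes a BlockMask over the full sequence once, and decode-time code slices out the query-block row for the current position instead of rebuilding the mask. The tensor shapes, the direct flex_attention call, and the input_pos value below are illustrative assumptions, not code from this PR; it assumes a CUDA device and a PyTorch build that ships torch.nn.attention.flex_attention (2.5+).

    # Illustrative sketch only; shapes and values are assumptions, not code from this PR.
    import torch
    from torch.nn.attention.flex_attention import create_block_mask, flex_attention

    def causal_mask(b, h, q, kv):
        # mask_mod: query position q may attend to key/value position kv iff q >= kv
        return q >= kv

    B, H, S, D = 1, 8, 256, 64  # hypothetical batch, heads, sequence length, head dim
    q = torch.randn(B, H, S, D, device="cuda")
    k = torch.randn(B, H, S, D, device="cuda")
    v = torch.randn(B, H, S, D, device="cuda")

    # Build the block mask once over the full sequence (broadcast over batch and heads),
    # analogous to what decode_n_tokens now does with model.max_seq_length.
    block_mask = create_block_mask(causal_mask, 1, 1, S, S, device="cuda")
    prefill_out = flex_attention(q, k, v, block_mask=block_mask)

    # For single-token decode, select the row of query blocks containing the current
    # position rather than rebuilding the mask, mirroring decode_one_token above.
    input_pos = torch.tensor([37], device="cuda")  # hypothetical decode position
    block_index = input_pos // block_mask.BLOCK_SIZE[0]
    decode_mask = block_mask[:, :, block_index]

Precomputing the BlockMask once and indexing into it keeps the per-token decode step free of mask construction, which is the point of threading block_mask through decode_one_token in this change.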