Remove repeat_interleave on k/v since flex_decoding supports GQA (via enable_gqa)

BoyuanFeng · BoyuanFeng · commit 9ba2eac49e4e · 2024-12-12T18:48:22.000-08:00
diff --git a/model.py b/model.py
@@ -219,8 +219,6 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: BlockMask, input_pos: Opti
         if self.kv_cache is not None:
             k, v = self.kv_cache.update(input_pos, k, v)
 
-        k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
-        v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
         y = flex_attention(q, k, v, block_mask=mask, enable_gqa=(self.n_head != self.n_local_heads))
 
         y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)