
Commit f649f69

This PR fixes the causal mask when seq_len_q != seq_len_kv (#314)

This PR adopts the causal-mask convention used by flash-attention (https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_interface.py#L1087), where the mask is aligned to the bottom-right corner of the score matrix, for the case seq_len_q != seq_len_kv.

Co-authored-by: Muhammad Tanvir <[email protected]>
1 parent f36600c commit f649f69

3 files changed: +32 -18 lines
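The adopted convention anchors the causal diagonal at the bottom-right corner of the seq_len_qo x seq_len_kv score matrix, so the last query row always sees every key. Below is a minimal standalone sketch of that predicate on a toy 4x6 problem; the helper name and sizes are illustrative and are not part of the changed files.

#include <algorithm>
#include <cstdio>

// Bottom-right-aligned causal mask predicate (illustrative helper, not PR code).
// A score at (row, col) is masked when the shifted column index exceeds the
// shifted row index.
bool is_masked(int row, int col, int seq_len_qo, int seq_len_kv) {
  int offset = std::min(seq_len_qo, seq_len_kv);
  int discard_seq_coord = seq_len_qo - offset;  // query rows that see no keys at all
  int full_tile_offset  = seq_len_kv - offset;  // shift that aligns the diagonal bottom-right
  return (col - full_tile_offset) > (row - discard_seq_coord);
}

int main() {
  // Toy example: 4 queries, 6 keys. 'x' = kept, '.' = masked.
  for (int row = 0; row < 4; row++) {
    for (int col = 0; col < 6; col++)
      std::printf("%c", is_masked(row, col, 4, 6) ? '.' : 'x');
    std::printf("\n");
  }
}

With 4 queries and 6 keys this prints a band that ends at the bottom-right corner (row 0 keeps 3 keys, row 3 keeps all 6); with more queries than keys, the leading rows are fully masked, which is what the kernel and host reference below handle explicitly.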

applications/flash_attention_v2/kernel/xe_flash_attn_gemm.hpp

Lines changed: 13 additions & 10 deletions
@@ -251,6 +251,17 @@ class GemmUniversalAttention {
         continue;
       }

+      auto offset = cute::min(seq_len_qo, seq_len_kv); //(2048, 1024)
+      auto discard_seq_coord = seq_len_qo - offset; //1024
+      auto full_tile_offset = seq_len_kv - offset; //0
+      const int seq_coord = cute::min(seq_len_qo, blk_m_coord * QK_BLK_M + (sub_group_id / PV_ATOM_N) * QK_SG_M);
+
+      const int seq_len = CausalMask ? full_tile_offset + cute::min(seq_len_kv, seq_coord - discard_seq_coord) + QK_SG_M : seq_len_kv;
+      const int nblock_limit = cute::ceil_div(seq_len, QK_BLK_N);
+      if (CausalMask && seq_coord < discard_seq_coord) {
+        continue;
+      }
+
       Tensor mQ_mkl = cute::get_pvc_tensor(make_shape(seq_len_qo, head_size_qk, (is_var_len ? 1 : batch) * num_heads_q)); //(m,k,l)
       Tensor mK_nkl = cute::get_pvc_tensor(make_shape(seq_len_kv, head_size_qk, (is_var_len ? 1 : batch) * num_head_kv)); //(n,k,l)
       Tensor mV_nkl = cute::get_pvc_tensor(make_shape(head_size_vo, seq_len_kv, (is_var_len ? 1 : batch) * num_head_kv)); //(n,k,l)

@@ -261,15 +272,7 @@ class GemmUniversalAttention {
       auto gQ = local_tile(mQ_mk, TileShapeQK{}, make_coord(blk_m_coord, _, _), Step<_1, X, _1>{});
       auto gK = local_tile(mK_nk, TileShapeQK{}, make_coord(_, _, _), Step<X, _1, _1>{});
       auto gV = local_tile(mV_nk, TileShapePV{}, make_coord(_, blk_n_coord, _), Step<X, _1, _1>{});
-
-      const int seq_coord = cute::min(seq_len_qo, blk_m_coord * QK_BLK_M + (sub_group_id / PV_ATOM_N) * QK_SG_M);
-
-      const int causal_seq_len = cute::min(seq_len_kv, seq_coord) + QK_SG_M;
-      const int non_causal_seq_len = seq_len_kv;
-
-      const int nblock_limit = CausalMask ? cute::ceil_div(causal_seq_len, QK_BLK_N)
-                                          : cute::ceil_div(non_causal_seq_len, QK_BLK_N);
-
+
       auto mainloop_params = CollectiveMainloop::get_updated_copies(params.mainloop, params.problem_shape, batch_coord);

       auto tiled_prefetch_q = cute::prefetch_selector<Shape<Int<QK_BLK_M>, Int<QK_BLK_K>>, Num_SGs>(mainloop_params.gmem_tiled_copy_q);

@@ -361,7 +364,7 @@ class GemmUniversalAttention {
           int row_idx = m * Vec + seq_coord;
           CUTLASS_PRAGMA_UNROLL
           for (int row = 0; row < Vec; row++, row_idx++) { // 8
-            if (col_idx > row_idx)
+            if ((col_idx - full_tile_offset) > (row_idx - discard_seq_coord))
              tSr(row, m, n) = -INFINITY;
           }
         }
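With this change, a query tile whose rows all fall above the causal band is skipped outright, and surviving tiles only iterate over the key blocks that can be unmasked. The sketch below walks that bookkeeping on the host for the (2048, 1024) case from the comments above; the tile sizes, the clamp to the total block count, and the omission of the sub-group split are assumptions for illustration, not kernel code.

#include <algorithm>
#include <cstdio>

int main() {
  const int seq_len_qo = 2048, seq_len_kv = 1024;
  const int QK_BLK_M = 128, QK_BLK_N = 64;  // assumed tile sizes, for illustration only

  const int offset = std::min(seq_len_qo, seq_len_kv);   // 1024
  const int discard_seq_coord = seq_len_qo - offset;     // 1024: rows with no visible keys
  const int full_tile_offset  = seq_len_kv - offset;     // 0
  const int total_kv_blocks = (seq_len_kv + QK_BLK_N - 1) / QK_BLK_N;

  for (int blk_m = 0; blk_m * QK_BLK_M < seq_len_qo; blk_m++) {
    const int seq_coord = std::min(seq_len_qo, blk_m * QK_BLK_M);
    if (seq_coord < discard_seq_coord) {
      // whole tile lies above the causal band: the kernel 'continue's here
      std::printf("query tile %2d: skipped (fully masked)\n", blk_m);
      continue;
    }
    // band length this tile must cover; clamped to the total key-block count for display
    const int seq_len = full_tile_offset + std::min(seq_len_kv, seq_coord - discard_seq_coord) + QK_BLK_M;
    const int nblock_limit = std::min((seq_len + QK_BLK_N - 1) / QK_BLK_N, total_kv_blocks);
    std::printf("query tile %2d: visits %2d of %2d key blocks\n", blk_m, nblock_limit, total_kv_blocks);
  }
}

For these sizes, tiles 0 through 7 are skipped entirely and tiles 8 through 15 visit a growing number of key blocks, which is the work-saving behaviour the early continue and the new nblock_limit encode.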

benchmarks/pvc/flash_attention_v2/benchmark_runner.hpp

Lines changed: 10 additions & 4 deletions
@@ -179,7 +179,7 @@ template <class FMHAConfiguration> struct BenchmarkRunnerFMHA {
       int offset_o = 0;
       // loop over the batch dimension to compute the output
       // to avoid the risk of running out of device memory
-      int q_group_size = num_heads_q/num_heads_kv;
+      int q_group_size = num_heads_q / num_heads_kv;
       for (int b = 0; b < batch; b++) {
         if constexpr (isVarLen) {
           auto logical_problem_shape = cutlass::fmha::collective::apply_variable_length(problem_size, b);

@@ -218,12 +218,14 @@ template <class FMHAConfiguration> struct BenchmarkRunnerFMHA {

         // delete this memory as it is no longer needed
         block_S.reset();
-
+        auto offset = cute::min(seq_len_qo, seq_len_kv);
+        auto discard_seq_coord = seq_len_qo - offset;
+        auto full_tile_offset = seq_len_kv - offset;
         if constexpr (Causal) {
           // apply mask to S
           for (int row = 0; row < seq_len_qo; row++) {
             for (int col = 0; col < seq_len_kv; col++) {
-              if (col > row)
+              if ((col - full_tile_offset) > (row - discard_seq_coord))
                 host_S[col + row * seq_len_kv] = -INFINITY;
             }
           }

@@ -263,7 +265,11 @@ template <class FMHAConfiguration> struct BenchmarkRunnerFMHA {
           idx = row * seq_len_kv;
           sum_idx = row;
           for (int col = 0; col < seq_len_kv; col++, idx++) {
-            host_S[idx] /= sum_vec[sum_idx];
+            if (Causal && row < discard_seq_coord) {
+              host_S[idx] = 0;
+            } else {
+              host_S[idx] /= sum_vec[sum_idx];
+            }
           }
         }
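The host reference now zeroes the rows that sit above the causal band instead of dividing by their exponent sum, which is zero for a fully masked row. A simplified reference illustrating why (toy sizes, no max-subtraction, names not taken from the benchmark):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int seq_len_qo = 4, seq_len_kv = 2;  // more queries than keys
  const int offset = std::min(seq_len_qo, seq_len_kv);
  const int discard_seq_coord = seq_len_qo - offset;  // 2: first two rows see no keys
  const int full_tile_offset  = seq_len_kv - offset;  // 0

  // Apply the bottom-right-aligned causal mask to a matrix of constant scores.
  std::vector<float> S(seq_len_qo * seq_len_kv, 1.0f);
  for (int row = 0; row < seq_len_qo; row++)
    for (int col = 0; col < seq_len_kv; col++)
      if ((col - full_tile_offset) > (row - discard_seq_coord))
        S[col + row * seq_len_kv] = -INFINITY;

  // Row-wise softmax; fully masked rows have sum == 0, so write 0 instead of dividing.
  for (int row = 0; row < seq_len_qo; row++) {
    float sum = 0.f;
    for (int col = 0; col < seq_len_kv; col++)
      sum += std::exp(S[col + row * seq_len_kv]);
    for (int col = 0; col < seq_len_kv; col++) {
      float &p = S[col + row * seq_len_kv];
      p = (row < discard_seq_coord) ? 0.f : std::exp(p) / sum;
      std::printf("%.2f%c", p, col + 1 == seq_len_kv ? '\n' : ' ');
    }
  }
}

The first discard_seq_coord rows come out as all zeros rather than NaN, matching the guard added to both the benchmark and the example runner.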

examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp

Lines changed: 9 additions & 4 deletions
@@ -232,13 +232,14 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {

       // delete this memory as it is no longer needed
       block_S.reset();
-
-      // Apply upper-diagonal masking if required
+      auto offset = cute::min(seq_len_qo, seq_len_kv);
+      auto discard_seq_coord = seq_len_qo - offset;
+      auto full_tile_offset = seq_len_kv - offset;
       if (is_causal) {
         // apply mask to S
         for (int row = 0; row < seq_len_qo; row++) {
           for (int col = 0; col < seq_len_kv; col++) {
-            if (col > row)
+            if ((col - full_tile_offset) > (row - discard_seq_coord))
               host_S[col + row * seq_len_kv] = -INFINITY;
           }
         }

@@ -278,7 +279,11 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {
         idx = row * seq_len_kv;
         sum_idx = row;
         for (int col = 0; col < seq_len_kv; col++, idx++) {
-          host_S[idx] /= sum_vec[sum_idx];
+          if (is_causal && row < discard_seq_coord) {
+            host_S[idx] = 0;
+          } else {
+            host_S[idx] /= sum_vec[sum_idx];
+          }
         }
       }
