File tree 1 file changed +8
-5
lines changed
torchao/csrc/cuda/sparse_marlin
1 file changed +8
-5
lines changed Original file line number Diff line number Diff line change @@ -401,10 +401,13 @@ __global__ void Marlin_24(
401
401
meta_ptr[i] += m_gl_rd_delta_o;
402
402
}
403
403
// Only fetch scales if this tile starts a new group
404
- if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0 ) {
405
- int4 * sh_s_stage = sh_s + s_sh_stage * pipe ;
406
- if (s_sh_wr_pred) cp_async4 (&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
407
- s_gl_rd += s_gl_rd_delta;
404
+ if constexpr (group_blocks != -1 ) {
405
+ if (pipe % (group_blocks / thread_k_blocks) == 0 ) {
406
+ int4 *sh_s_stage = sh_s + s_sh_stage * pipe ;
407
+ if (s_sh_wr_pred)
408
+ cp_async4 (&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
409
+ s_gl_rd += s_gl_rd_delta;
410
+ }
408
411
}
409
412
}
410
413
// Insert a fence even when we are winding down the pipeline to ensure that
@@ -429,7 +432,7 @@ __global__ void Marlin_24(
429
432
// however, this does not seem to be a significant bottleneck, while some
430
433
// theoretically better attempts have led to bad instruction ordering by
431
434
// the compiler and correspondingly a noticeable drop in performance.
432
- if (group_blocks != -1 ) {
435
+ if constexpr (group_blocks != -1 ) {
433
436
int4 * sh_s_stage =
434
437
sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
435
438
(pipe / (group_blocks / thread_k_blocks)));
You can’t perform that action at this time.
0 commit comments