
Commit ceceea5

promote blocksparse from prototype, make it faster (#1734)
This PR promotes block sparsity out of prototype in torchao. Chiefly, it ports the triton addmm blocksparse kernels over from core and makes several performance improvements to them. All numbers reported below are for an H100 with blocksize=64 and sparsity_level=0.9; the default dense baseline is 134 tok/s.

1) Adds padding support to the triton kernel for dense matrices with a dimension < 16, like those we run into during decoding. (214 -> 218 tok/s)
2) Changes the default [num_stages](triton-lang/triton#512) parameter from 1 to 4. This has a large effect on performance, and the default kernel autotuning appears to either leave this parameter untouched or deem it unimportant. (218 -> 263 tok/s)
3) Adds an environment variable, BSR_AUTOTUNE, that users can set to run kernel autotuning on top of the default parameters. (263 -> 266 tok/s) This matters more for bs=n compute-bound workloads, where I see a reduction from 0.3855 s to 0.3745 s on bs=8192 prefill (roughly 3%).

In total we are seeing a **1.985x** speedup over the dense baseline 🚀

I've also updated the documentation to no longer reference prototype; I'm planning on updating the diagram in a subsequent PR.

### Testing

I added a new test case for the padded inputs and moved the test file out of prototype:

```
python test/sparsity/test_sparse_api.py
```
1 parent ed16fe7 commit ceceea5
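For orientation, here is a minimal, hedged usage sketch of the promoted API. `sparsify_` and `block_sparse_weight` appear in the diffs below, and `BSR_AUTOTUNE` is the env var added by this PR; the toy model, the env var value, and setting it before import are illustrative assumptions rather than part of the PR.

```python
import os

# BSR_AUTOTUNE is the autotuning env var added in this PR; the value "1" and setting
# it before importing torchao are assumptions about how it is consumed.
os.environ["BSR_AUTOTUNE"] = "1"

import torch
from torch import nn
from torchao.sparsity import block_sparse_weight, sparsify_

# Toy stand-in for the benchmarked llama FFN layers (fp16 on CUDA, as in the test).
model = nn.Sequential(nn.Linear(1024, 2048), nn.Linear(2048, 1024)).half().cuda()

# Swap dense nn.Linear weights for block-sparse (BSR) weights backed by the
# promoted triton addmm kernels.
sparsify_(model, block_sparse_weight(blocksize=64))

# Prefill-shaped input; the test diff below also covers the decode-shaped (1, 1024) case.
out = model(torch.rand(1024, 1024, dtype=torch.float16, device="cuda"))
```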

File tree: 9 files changed (+843 −43 lines)


test/prototype/test_sparse_api.py renamed to test/sparsity/test_sparse_api.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -132,8 +132,9 @@ class TestBlockSparseWeight(common_utils.TestCase):
         )
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
-    def test_sparse(self, compile):
-        input = torch.rand((1024, 1024)).half().cuda()
+    @common_utils.parametrize("input_shape", [1, 1024])
+    def test_sparse(self, compile, input_shape):
+        input = torch.rand((input_shape, 1024)).half().cuda()
         model = (
             nn.Sequential(
                 nn.Linear(1024, 2048),
@@ -152,9 +153,7 @@ def test_sparse(self, compile):
         model[1].weight.data = create_block_sparse_tensor(M, N, 64, 0.5, torch.float16)
         dense_result = model(input)

-        from torchao.prototype.sparsity.superblock.blocksparse import (
-            block_sparse_weight,
-        )
+        from torchao.sparsity import block_sparse_weight

         sparsify_(model, block_sparse_weight(blocksize=64))
         # if compile:
```
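The new `input_shape=1` case exercises the decode-time padding path from item 1 of the commit message (dense operands with a dimension < 16). A standalone sketch of that shape, assuming the same layer size and blocksize as the test:

```python
import torch
from torch import nn
from torchao.sparsity import block_sparse_weight, sparsify_

# A single block-sparse linear layer is enough to reach the triton addmm kernel.
layer = nn.Sequential(nn.Linear(1024, 2048)).half().cuda()
sparsify_(layer, block_sparse_weight(blocksize=64))

# Single-token activation, as seen during decoding: its leading dimension (1) is < 16,
# so the kernel's new padding path should be taken.
decode_input = torch.rand((1, 1024), dtype=torch.float16, device="cuda")
out = layer(decode_input)
```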

torchao/_models/llama/generate.py

Lines changed: 35 additions & 4 deletions
```diff
@@ -793,9 +793,37 @@ def ffn_or_attn_only(mod, fqn):
         from torchao.sparsity import semi_sparse_weight, sparsify_

         if "semi" in sparsity:
-            # TODO there is a bug here, need to fix
+            # Fixed sparsity level for 2:4
             sparsify_(model.to(device), semi_sparse_weight(), filter_fn=ffn_only)

+        if "bsr" in sparsity:
+            from torchao.sparsity import SupermaskLinear, block_sparse_weight
+
+            # parse "bsr-0.9-64"
+            _, sparsity_level, blocksize = sparsity.split("-")
+            sparsity_level, blocksize = float(sparsity_level), int(blocksize)
+            sparsify_(
+                model,
+                lambda x: SupermaskLinear.from_linear(
+                    x,
+                    sparsity_level=sparsity_level,
+                    blocksize=blocksize,
+                ),
+                filter_fn=ffn_only,
+            )
+            print(model)
+            sparsify_(
+                model,
+                SupermaskLinear.to_linear,
+                filter_fn=ffn_only,
+            )
+            print(model)
+
+            # Accelerate with triton bsr kernels
+            sparsify_(
+                model, block_sparse_weight(blocksize=blocksize), filter_fn=ffn_only
+            )
+
     model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9

     if save:
@@ -810,7 +838,10 @@ def ffn_or_attn_only(mod, fqn):
         print("Compiling Model")
         global decode_one_token, prefill
         decode_one_token = torch.compile(
-            decode_one_token, mode="reduce-overhead", fullgraph=True
+            decode_one_token,
+            mode="reduce-overhead",
+            fullgraph=True,
+            dynamic=True,
         )

     if compile_prefill:
@@ -849,7 +880,7 @@ def ffn_or_attn_only(mod, fqn):
         prompt = f"{B_INST} {prompt.strip()} {E_INST}"
         encoded = encode_tokens(tokenizer, prompt, bos=True, device=device)

-        if interactive and i >= 0:
+        if interactive and i >= 0 and prefill_size is None:
             buffer = []
             period_id = tokenizer.encode(".")[0]
             done_generating = False
@@ -919,7 +950,7 @@ def callback(x):
             device_sync(device=device)  # MKG
             t = time.perf_counter() - t0

-        if not interactive and demo_summarize_prompt is None:
+        if not interactive and demo_summarize_prompt is None and prefill_size is None:
             tok_list = y[0].tolist()
             # truncate text after end of string token
             tokens = (
```
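The generate.py change applies block sparsity in three passes: `SupermaskLinear.from_linear` imposes a block-sparse mask at the requested sparsity level, `SupermaskLinear.to_linear` folds the mask back into plain linear weights, and `block_sparse_weight` converts the pruned weights to the accelerated BSR representation. A minimal sketch of that flow on a toy model, using the same calls as the diff; the toy model and the explicit filter in step 2 (standing in for `filter_fn=ffn_only`) are assumptions for illustration.

```python
import torch
from torch import nn
from torchao.sparsity import SupermaskLinear, block_sparse_weight, sparsify_

# Same values as the benchmarked "bsr-0.9-64" configuration.
sparsity_level, blocksize = 0.9, 64

model = nn.Sequential(nn.Linear(1024, 2048), nn.Linear(2048, 1024)).half().cuda()

# 1) Wrap each linear with SupermaskLinear to impose a block-sparse mask.
sparsify_(
    model,
    lambda x: SupermaskLinear.from_linear(
        x, sparsity_level=sparsity_level, blocksize=blocksize
    ),
)

# 2) Fold the mask back into ordinary nn.Linear weights (now containing zeroed blocks).
sparsify_(
    model,
    SupermaskLinear.to_linear,
    filter_fn=lambda mod, fqn: isinstance(mod, SupermaskLinear),
)

# 3) Convert the pruned weights to BSR so matmuls hit the triton kernels.
sparsify_(model, block_sparse_weight(blocksize=blocksize))
```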

torchao/kernel/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,6 +1,8 @@
+from torchao.kernel.bsr_triton_ops import bsr_dense_addmm
 from torchao.kernel.intmm import int_scaled_matmul, safe_int_mm

 __all__ = [
+    "bsr_dense_addmm",
     "safe_int_mm",
     "int_scaled_matmul",
 ]
```
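For completeness, a hedged sketch of calling the newly exported kernel directly. The `(input, bsr, dense)` argument order is an assumption carried over from the core `torch.sparse._triton_ops.bsr_dense_addmm` kernel this file was ported from; most users should go through `block_sparse_weight` instead.

```python
import torch
from torchao.kernel import bsr_dense_addmm

M, K, N, blocksize = 1024, 1024, 2048, 64

# Block-sparse left operand in BSR layout plus a dense right operand (fp16 on CUDA).
bsr = torch.rand(M, K, dtype=torch.float16, device="cuda").to_sparse_bsr(blocksize)
dense = torch.rand(K, N, dtype=torch.float16, device="cuda")
bias = torch.zeros(M, N, dtype=torch.float16, device="cuda")

# Assumed semantics, mirroring torch.addmm: out = bias + bsr @ dense.
out = bsr_dense_addmm(bias, bsr, dense)
```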
