From 381ef76d6e2f0f8421f3a5e6ee769a14cb6249cc Mon Sep 17 00:00:00 2001
From: Apurva Jain
Date: Thu, 3 Apr 2025 23:06:51 -0700
Subject: [PATCH] Move model benchmarking

---
 benchmarks/models/llama/README.md | 33 +
 benchmarks/models/llama/_eval.py | 362 +++++
 benchmarks/models/llama/benchmark_results.txt | 52 +
 benchmarks/models/llama/benchmarks.sh | 118 ++
 benchmarks/models/llama/bsr_bench_results.txt | 27 +
 benchmarks/models/llama/bsr_benchmarks.sh | 17 +
 benchmarks/models/llama/demo_summarize.sh | 13 +
 benchmarks/models/llama/eval.py | 355 +++++
 benchmarks/models/llama/evals.sh | 28 +
 benchmarks/models/llama/generate.py | 1255 +++++++++++++++++
 benchmarks/models/sam/benchmark.sh | 17 +
 benchmarks/models/sam/eval_combo.py | 692 +++++++++
 benchmarks/models/sam/flash_4_configs.p | Bin 0 -> 219 bytes
 benchmarks/models/sam/metrics.py | 76 +
 benchmarks/models/sam/results.csv | 7 +
 benchmarks/models/utils.py | 111 ++
 16 files changed, 3163 insertions(+)
 create mode 100644 benchmarks/models/llama/README.md
 create mode 100644 benchmarks/models/llama/_eval.py
 create mode 100644 benchmarks/models/llama/benchmark_results.txt
 create mode 100644 benchmarks/models/llama/benchmarks.sh
 create mode 100644 benchmarks/models/llama/bsr_bench_results.txt
 create mode 100644 benchmarks/models/llama/bsr_benchmarks.sh
 create mode 100644 benchmarks/models/llama/demo_summarize.sh
 create mode 100644 benchmarks/models/llama/eval.py
 create mode 100644 benchmarks/models/llama/evals.sh
 create mode 100644 benchmarks/models/llama/generate.py
 create mode 100755 benchmarks/models/sam/benchmark.sh
 create mode 100644 benchmarks/models/sam/eval_combo.py
 create mode 100644 benchmarks/models/sam/flash_4_configs.p
 create mode 100644 benchmarks/models/sam/metrics.py
 create mode 100644 benchmarks/models/sam/results.csv
 create mode 100644 benchmarks/models/utils.py

diff --git a/benchmarks/models/llama/README.md b/benchmarks/models/llama/README.md
new file mode 100644
index 0000000000..99f1919fc9
--- /dev/null
+++ b/benchmarks/models/llama/README.md
@@ -0,0 +1,33 @@
+# Llama Benchmarks
+
+The llama folder contains code and scripts for stable benchmarking of Llama models.
+
+To get model weights, go to https://huggingface.co/meta-llama/Llama-2-7b, https://huggingface.co/meta-llama/Meta-Llama-3-8B, https://huggingface.co/meta-llama/Meta-Llama-3.1-8B
+and follow the steps to gain access.
+
+Then, from the torchao root directory, use `huggingface-cli login` and follow the steps to log in, then run `sh ./scripts/prepare.sh` to
+download and convert the model weights.
+
+Once done, you can execute benchmarks from the benchmarks/models/llama dir with `sh benchmarks.sh`. You can run benchmarking or evaluation
+directly using `generate.py` or `eval.py`.
+
+## KV Cache Quantization - Memory Efficient Inference
+We've added some features to `model.py` compared to the original gpt-fast implementation in order to enable long context length (and necessarily memory-efficient) inference. Specifically, we've added kv_cache quantization and a linear_causal_mask implementation, which are **able to reduce memory usage by 50-60%** at long context lengths.
+
+In practice these features, alongside int4 weight-only quantization, allow us to do Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.**
+
+You can check it out yourself with `generate.py`; a sample invocation is sketched below. These features exist as a proof of concept and technical demonstration of the techniques, though we're working to figure out a way to release them in a more general way.
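+
+For reference, a long-context run that combines these features with int4 weight-only quantization might look roughly like the sketch below. This is assembled from flags that appear in `benchmarks.sh`; the exact flag combination behind the 18.9 GB number above isn't spelled out there, so treat it as illustrative rather than an exact reproduction recipe.
+
+```sh
+# Illustrative sketch: Llama-3.1-8B with a 131072-token cache, int4 weight-only
+# quantization, kv_cache quantization, and the linear causal mask.
+export CHECKPOINT_PATH=../../../checkpoints
+export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth \
+    --quantization int4wo-64 --cache_size 131072 \
+    --kv_cache_quantization --linear_causal_mask \
+    --write_result benchmark_results.txt
+```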
+Until then, feel free to copy these features into your own models. The details and a full explanation can be found in this [PR](https://github.com/pytorch/ao/pull/738).
+
+To see how these techniques scale more generally, we've run `generate.py` with subsets of these features for different context lengths on an A100 GPU. You can find commands to reproduce these numbers in `benchmarks.sh`.
+
+| context length (tokens) | normal peak (GB) | kv_quant peak (GB) | kv quant+linear_causal_mask peak (GB) |
+|-------------------------|------------------|--------------------|---------------------------------------|
+| 8192 | 17.86 | 17.52 | 17.47 |
+| 16384 | 19.81 | 18.75 | 18.48 |
+| 32768 | 23.83 | 21.72 | 20.64 |
+| 65536 | 33.5 | 29.54 | 25.24 |
+| 131072 | 59.27 | 52.62 | 34.18 |
+
+## Adding Benchmarks For New Techniques
+
+If you want to add benchmarks that you think should be kept up to date, please try to keep the format consistent. For performance-focused techniques (e.g. if they require fine-tuning or something else), add an option to run them in generate.py and an execution command in benchmarks.sh in the relevant section. If it's a technique that's still in development, add it in the `OTHER BENCHMARKS` section; if there's a finalized API and you want those numbers in the main quantization README, add them in the `README BENCHMARKS` section. For accuracy-focused techniques, add them to eval.py and evals.sh in a similar vein. Ideally, techniques in the main README will have both benchmarks and evaluations set up here so they can be monitored and reproduced easily.
diff --git a/benchmarks/models/llama/_eval.py b/benchmarks/models/llama/_eval.py
new file mode 100644
index 0000000000..9f429278e3
--- /dev/null
+++ b/benchmarks/models/llama/_eval.py
@@ -0,0 +1,362 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
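+
+# Rough usage sketch (an assumption pieced together from the class signatures
+# below, not a verbatim copy of eval.py): `InputRecorder` is driven through
+# lm_eval's `evaluate` loop to capture GPTQ calibration inputs, while
+# `TransformerEvalWrapper` runs the actual evaluation on a GPTFast-style model.
+# Task names and sequence lengths here are illustrative only.
+#
+#     recorder = InputRecorder(tokenizer, calibration_seq_length=2048)
+#     recorder.record_inputs(["wikitext"], calibration_limit=10)
+#     calibration_inputs = recorder.get_inputs()
+#
+#     TransformerEvalWrapper(model, tokenizer, max_seq_length=2048).run_eval(
+#         tasks=["wikitext"], limit=None
+#     )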
+ +import lm_eval +import torch +import torch.nn.functional as F + +from torchao.quantization.GPTQ_MT import MultiTensor +from torchao.quantization.utils import _MultiInput + +try: # lm_eval version 0.4 + from lm_eval.evaluator import evaluate # pyre-ignore[21] + from lm_eval.models.huggingface import HFLM as eval_wrapper # pyre-ignore[21] + from lm_eval.tasks import get_task_dict # pyre-ignore[21] +except: # lm_eval version 0.3 + from lm_eval import base, evaluator, tasks + + eval_wrapper = base.BaseLM + get_task_dict = tasks.get_task_dict + evaluate = evaluator.evaluate + + +class MultiTensorInputRecorder(eval_wrapper): + def __init__( + self, + tokenizer, + calibration_seq_length, + input_prep_func=None, + pad_calibration_inputs=False, + vocab_size=32000, + pad_token=0, + device="cpu", + ): + try: + super().__init__() + except TypeError: + # lm_eval 0.4.2 removed the default init + super().__init__("gpt2", device="cpu") + + self.tokenizer = tokenizer + self._device = torch.device(device) + self.vocab_size = vocab_size + self._max_seq_length = calibration_seq_length + self.calibration_seq_length = calibration_seq_length + + self.input_prep_func = ( + input_prep_func if input_prep_func is not None else lambda x: (x,) + ) + + self.pad_calibration_inputs = pad_calibration_inputs + self.pad_token = pad_token + + # Initialize inputs as a list of two empty lists for input tensors and indices + self.inputs = [[], []] + + @property + def eot_token_id(self): + try: + return self.tokenizer.eos_id() + except: + return self.tokenizer.eos_id + + @property + def max_length(self): + return self._max_seq_length + + @property + def max_gen_toks(self): + return 50 + + @property + def batch_size(self): + return 1 + + @property + def device(self): + return self._device + + def tok_encode(self, string: str, **kwargs): + tokens = self.tokenizer.encode(string) + if hasattr(self.tokenizer, "bos_id"): + try: + tokens = [self.tokenizer.bos_id()] + tokens + except: + tokens = [self.tokenizer.bos_id] + tokens + return tokens + + def tok_decode(self, tokens): + decoded = self.tokenizer.decode(tokens) + return decoded + + def add_input(self, args): + # Ensure that inputs are added correctly as pairs + self.inputs[0].append(args[0]) + self.inputs[1].append(args[1]) + + def record_inputs(self, calibration_tasks, calibration_limit): + try: + lm_eval.tasks.initialize_tasks() + except: + pass + + task_dict = get_task_dict(calibration_tasks) + print("Obtaining GPTQ calibration inputs on: ", calibration_tasks) + + evaluate( + self, + task_dict, + limit=calibration_limit, + ) + return self + + def get_inputs(self): + # Return MultiTensor instances for both inputs and indices + return [MultiTensor(self.inputs[0]), MultiTensor(self.inputs[1])] + + def _model_call(self, inps): + inps = inps.squeeze(0) + T = len(inps) + if ( + # Can't use inputs that are too short when padding is disabled + (T < self.calibration_seq_length and not self.pad_calibration_inputs) + or + # Can't use inputs that actually use the token we use for padding + (self.pad_calibration_inputs and self.pad_token in inps) + ): + # Give random output + return torch.randn( + (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device + ) + + # Pad or truncate to the correct size + if T >= self.calibration_seq_length: + inps = inps[: self.calibration_seq_length] + else: + inps = F.pad( + inps, (0, self.calibration_seq_length - T), value=self.pad_token + ) + + inps = inps.unsqueeze(0) + model_in = self.input_prep_func(inps) + + self.add_input(model_in) + + # 
Output `something` with the correct shape to keep eval going + return torch.randn( + (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device + ) + + def _model_generate(self, context, max_length, eos_token_id): + raise Exception("unimplemented") + + +class InputRecorder(eval_wrapper): + """ + This is a fake evaluation wrapper from the lm_eval library that just records the inputs + so that they can be used in calibration. + + If pad_calibration_inputs is enabled, the input recorder will take + each input and pad/truncate it down to the calibration_seq_length. + (if using padding you should set the embeddings for the pad_token to 0 + in the model) + + Note: after padding/truncation, input_prep_function is called to bring + it to the proper form to be inserted into a given model. + + If not, it will only truncate inputs to the desired length. + """ + + def __init__( + self, + tokenizer, + calibration_seq_length, + input_prep_func=None, + pad_calibration_inputs=False, + vocab_size=32000, + pad_token=0, + device="cpu", + ): + try: + super().__init__() + except TypeError: + # lm_eval 0.4.2 removed the default init + super().__init__("gpt2", device="cpu") + + self.tokenizer = tokenizer + self._device = torch.device(device) + self.vocab_size = vocab_size + self._max_seq_length = calibration_seq_length + self.calibration_seq_length = calibration_seq_length + + # need to take inps and convert to corrent input + # for model + self.input_prep_func = ( + input_prep_func if input_prep_func is not None else lambda x: (x,) + ) + + self.pad_calibration_inputs = pad_calibration_inputs + self.pad_token = pad_token + + self.inputs = None + + @property + def eot_token_id(self): + try: + return self.tokenizer.eos_id() + except: + return self.tokenizer.eos_id + + @property + def max_length(self): + return self._max_seq_length + + @property + def max_gen_toks(self): + return 50 + + @property + def batch_size(self): + return 1 + + @property + def device(self): + return self._device + + def tok_encode(self, string: str, **kwargs): + # TODO: verify this for multi-batch as well + tokens = self.tokenizer.encode(string) + if hasattr(self.tokenizer, "bos_id"): + try: + tokens = [self.tokenizer.bos_id()] + tokens + except: + tokens = [self.tokenizer.bos_id] + tokens + return tokens + + def tok_decode(self, tokens): + decoded = self.tokenizer.decode(tokens) + return decoded + + def add_input(self, args): + if self.inputs is None: + self.inputs = [_MultiInput([arg]) for arg in args] + else: + self.inputs = [ + multi.add_input(arg) for (multi, arg) in zip(self.inputs, args) + ] + + def record_inputs( + self, + calibration_tasks, + calibration_limit, + ): + try: + lm_eval.tasks.initialize_tasks() + except: + pass + + task_dict = get_task_dict(calibration_tasks) + print("Obtaining GPTQ calibration inputs on: ", calibration_tasks) + + evaluate( + self, + task_dict, + limit=calibration_limit, + ) + return self + + def get_inputs(self): + return self.inputs + + def _model_call(self, inps): + inps = inps.squeeze(0) + T = len(inps) + if ( + # can't use inputs that are too short when padding disabled + (T < self.calibration_seq_length and not self.pad_calibration_inputs) + or + # can't use inputs that actually use token we use for padding + (self.pad_calibration_inputs and self.pad_token in inps) + ): + # give random output + return torch.randn( + (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device + ) + + # pad or truncate to the right size + if T >= self.calibration_seq_length: + inps = inps[: 
self.calibration_seq_length]
+        else:
+            # pad on the right up to calibration_seq_length, filling with the
+            # pad token (same convention as MultiTensorInputRecorder above)
+            inps = F.pad(
+                inps, (0, self.calibration_seq_length - T), value=self.pad_token
+            )
+
+        inps = inps.unsqueeze(0)
+        model_in = self.input_prep_func(inps)
+
+        self.add_input(model_in)
+
+        # output `something` with correct shape to keep eval going
+        return torch.randn(
+            (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
+        )
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        raise Exception("unimplemented")
+
+
+class TransformerEvalWrapper(InputRecorder):
+    """
+    A wrapper class for GPTFast, providing integration with the lm-evaluation-harness library.
+    """
+
+    def __init__(
+        self, model, tokenizer, max_seq_length, input_prep_func=None, device="cuda"
+    ):
+        super().__init__(tokenizer, None)
+        self._model = model
+        # self.tokenizer = tokenizer
+        self._device = torch.device(device)
+        self._max_seq_length = max_seq_length
+
+        # need to take inps and convert to correct input
+        # for model
+        self.input_prep_func = (
+            input_prep_func if input_prep_func is not None else lambda x: (x,)
+        )
+
+    def _model_call(self, inps):
+        # TODO: make batches work
+        input = self.input_prep_func(inps)
+
+        # set up model caches for the (capped) sequence length before the forward pass
+        max_seq_length = min(max(inps.size()), self.max_length)
+        with torch.device(self._device):
+            self._model.setup_caches(self.batch_size, max_seq_length)
+        logits = self._model(*input)
+        return logits
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        raise Exception("unimplemented")
+
+    def run_eval(self, tasks, limit):
+        try:
+            lm_eval.tasks.initialize_tasks()
+        except:
+            pass
+
+        task_dict = get_task_dict(tasks)
+        print("Evaluating Model On: ", task_dict)
+        with torch.no_grad():
+            result = evaluate(
+                self,
+                task_dict,
+                limit=limit,
+            )
+        for task, res in result["results"].items():
+            print(f"{task}: {res}")
+        return result
diff --git a/benchmarks/models/llama/benchmark_results.txt b/benchmarks/models/llama/benchmark_results.txt
new file mode 100644
index 0000000000..d59c5f552e
--- /dev/null
+++ b/benchmarks/models/llama/benchmark_results.txt
@@ -0,0 +1,52 @@
+README BENCHMARKS
+llama 2
+20240831225155, tok/s=107.38, mem/s=1418.93 GB/s, peak_mem=13.88 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
+20240831225810, tok/s= 9.61, mem/s= 63.67 GB/s, peak_mem= 8.61 GB, model_size= 6.62 GB quant: int8dq, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8dq --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
+20240831230013, tok/s=170.83, mem/s=1131.18 GB/s, peak_mem= 8.95 GB, model_size= 6.62 GB quant: int8wo, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8wo --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
+20240910152454, tok/s=117.89, mem/s= 584.57 GB/s, peak_mem= 6.52 GB, model_size= 4.96 GB quant: fp6,
mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization fp6 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831230205, tok/s=201.14, mem/s= 751.42 GB/s, peak_mem= 4.87 GB, model_size= 3.74 GB quant: int4wo-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831230736, tok/s=177.45, mem/s=1194.35 GB/s, peak_mem= 8.64 GB, model_size= 6.73 GB quant: autoquant, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization autoquant --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240902100527, tok/s=209.19, mem/s= 804.32 GB/s, peak_mem= 4.89 GB, model_size= 3.84 GB quant: autoquant-int4, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization autoquant-int4 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 + +llama 3 +20240831232535, tok/s= 95.64, mem/s=1435.54 GB/s, peak_mem=16.43 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831233224, tok/s= 8.61, mem/s= 64.75 GB/s, peak_mem= 9.24 GB, model_size= 7.52 GB quant: int8dq, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8dq --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831233853, tok/s=153.03, mem/s=1150.80 GB/s, peak_mem=10.42 GB, model_size= 7.52 GB quant: int8wo, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910153353, tok/s=161.58, mem/s= 910.02 GB/s, peak_mem= 7.72 GB, model_size= 5.63 GB quant: fp6, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization fp6 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision 
torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831234218, tok/s=180.80, mem/s= 763.33 GB/s, peak_mem= 6.88 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831235355, tok/s=158.10, mem/s=1193.24 GB/s, peak_mem=10.04 GB, model_size= 7.55 GB quant: autoquant, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization autoquant --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240902101015, tok/s=188.41, mem/s= 800.58 GB/s, peak_mem= 7.14 GB, model_size= 4.25 GB quant: autoquant-int4, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization autoquant-int4 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 + +KV CACHE QUANTIZATION: +20240826161508, tok/s= 19.71, mem/s= 295.80 GB/s, peak_mem=17.86 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 8192 +20240826161747, tok/s= 13.52, mem/s= 202.96 GB/s, peak_mem=17.52 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 8192--kv_cache_quantization +20240826162028, tok/s= 13.30, mem/s= 199.66 GB/s, peak_mem=17.47 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 8192--kv_cache_quantization --linear_causal_mask +20240826162318, tok/s= 12.54, mem/s= 188.22 GB/s, peak_mem=19.81 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 16384 +20240826162620, tok/s= 10.67, mem/s= 160.12 GB/s, peak_mem=18.75 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, 
kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 16384--kv_cache_quantization +20240826162920, tok/s= 10.57, mem/s= 158.67 GB/s, peak_mem=18.48 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 16384--kv_cache_quantization --linear_causal_mask +20240826163307, tok/s= 7.11, mem/s= 106.75 GB/s, peak_mem=23.83 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 32768 +20240826163710, tok/s= 6.33, mem/s= 94.98 GB/s, peak_mem=21.72 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 32768--kv_cache_quantization +20240826164117, tok/s= 6.20, mem/s= 93.02 GB/s, peak_mem=20.64 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 32768--kv_cache_quantization --linear_causal_mask +20240826164715, tok/s= 3.72, mem/s= 55.84 GB/s, peak_mem=33.50 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 65536 +20240826165343, tok/s= 3.32, mem/s= 49.90 GB/s, peak_mem=29.54 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 65536--kv_cache_quantization +20240826170011, tok/s= 3.31, mem/s= 49.71 GB/s, peak_mem=25.24 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 
--max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 65536--kv_cache_quantization --linear_causal_mask +20240826171015, tok/s= 1.95, mem/s= 29.21 GB/s, peak_mem=59.27 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 131072 +20240826172121, tok/s= 1.73, mem/s= 26.02 GB/s, peak_mem=52.62 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 131072--kv_cache_quantization +20240826173230, tok/s= 1.73, mem/s= 25.95 GB/s, peak_mem=34.18 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 131072--kv_cache_quantization --linear_causal_mask + +OTHER BENCHMARKS +20240831224311, tok/s= 26.75, mem/s= 707.01 GB/s, peak_mem=27.23 GB, model_size=26.43 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float32, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float32 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831224512, tok/s= 22.97, mem/s= 303.53 GB/s, peak_mem=13.64 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831224958, tok/s=108.48, mem/s=1433.57 GB/s, peak_mem=13.90 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910004030, tok/s= 22.72, mem/s= 112.66 GB/s, peak_mem=10.41 GB, model_size= 4.96 GB quant: fp6, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization fp6 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910004539, tok/s= 50.99, mem/s= 200.08 GB/s, peak_mem= 6.29 GB, model_size= 3.92 GB quant: uintx-4-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, 
device: cuda repro: python generate.py --quantization uintx-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910005147, tok/s= 40.25, mem/s= 265.95 GB/s, peak_mem= 9.24 GB, model_size= 6.61 GB quant: uintx-2-8, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-2-8 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910110554, tok/s=245.07, mem/s= 657.93 GB/s, peak_mem= 4.05 GB, model_size= 2.68 GB quant: sparse-marlin, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 + +20240831231514, tok/s= 26.54, mem/s= 796.59 GB/s, peak_mem=32.34 GB, model_size=30.02 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float32, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float32 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831231725, tok/s= 23.67, mem/s= 355.33 GB/s, peak_mem=16.19 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831232327, tok/s= 96.59, mem/s=1449.85 GB/s, peak_mem=16.43 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910005537, tok/s= 20.22, mem/s= 113.89 GB/s, peak_mem=23.17 GB, model_size= 5.63 GB quant: fp6, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization fp6 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910010056, tok/s= 47.85, mem/s= 213.24 GB/s, peak_mem=11.85 GB, model_size= 4.46 GB quant: uintx-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910010647, tok/s= 34.83, mem/s= 261.42 GB/s, peak_mem=14.99 GB, model_size= 7.51 GB quant: 
uintx-2-8, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-2-8 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910110958, tok/s=223.95, mem/s= 682.88 GB/s, peak_mem= 5.59 GB, model_size= 3.05 GB quant: sparse-marlin, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 diff --git a/benchmarks/models/llama/benchmarks.sh b/benchmarks/models/llama/benchmarks.sh new file mode 100644 index 0000000000..4c11b193d5 --- /dev/null +++ b/benchmarks/models/llama/benchmarks.sh @@ -0,0 +1,118 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder + +# README BENCHMARKS +export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt + +export MODEL_REPO=meta-llama/Meta-Llama-3-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt + +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo 
--write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt +# Runs on H100, float8 is not supported on CUDA arch < 8.9 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8wo --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-tensor --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-wo --write_result benchmark_results.txt + +# OTHER BENCHMARKS + +# kv cache quantization +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization --linear_causal_mask +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization --linear_causal_mask +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization --linear_causal_mask +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization --linear_causal_mask +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization --linear_causal_mask + +export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill 
--write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt + +export MODEL_REPO=meta-llama/Meta-Llama-3-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt + +# Different Batch Size Benchmarks +export MODEL_REPO=meta-llama/Meta-Llama-3-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 1 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 128 + +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 1 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 128 + +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 1 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path 
$CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 128 + +# TTFT benchmarks +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8dq --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8wo --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8dq --sparsity semi-structured --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization float8dq --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization float8wo --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int4wo-64 --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization sparse-marlin --write_result benchmark_results.txt --prefill_size 8000 --precision float16 --sparsity semi-structured + +# gemlite benchmarks +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-None --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-None --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-8-None --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-8-None --write_result benchmark_results.txt + +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-64 --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-64 --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-None --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-None --write_result 
benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-8-None --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-8-None --write_result benchmark_results.txt --batch_size 32 + +# 2:4 sparse model +export MODEL_REPO=nm-testing/SparseLlama-3-8B-pruned_50.2of4 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --sparsity semi-structured --precision float16 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result benchmark_results.txt diff --git a/benchmarks/models/llama/bsr_bench_results.txt b/benchmarks/models/llama/bsr_bench_results.txt new file mode 100644 index 0000000000..09c4d6cbcd --- /dev/null +++ b/benchmarks/models/llama/bsr_bench_results.txt @@ -0,0 +1,27 @@ + +20250226151422, tok/s=133.29, tok/s_decode=134.40, ttft=0.0118, mem/s=2000.68 GB/s, peak_mem=16.30 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226151926, tok/s=242.08, tok/s_decode=256.68, ttft=0.0464, mem/s=1182.14 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226152416, tok/s=252.18, tok/s_decode=267.48, ttft=0.0448, mem/s=1229.49 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226153215, tok/s=204.19, tok/s_decode=213.86, ttft=0.0438, mem/s=1226.65 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226153628, tok/s=180.14, tok/s_decode=187.54, ttft=0.0433, mem/s=1081.56 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB 
quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226160622, tok/s=246.20, tok/s_decode=255.21, ttft=0.0281, mem/s= 956.89 GB/s, peak_mem= 5.56 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226160651, tok/s=145.07, tok/s_decode=163.13, ttft=0.1522, mem/s=1461.87 GB/s, peak_mem=22.76 GB, model_size=10.08 GB quant: None, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 + +20250226161533, tok/s=250.71, tok/s_decode=254.78, ttft=0.0121, mem/s= 974.38 GB/s, peak_mem= 5.56 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226161913, tok/s=251.19, tok/s_decode=254.95, ttft=0.0112, mem/s= 976.26 GB/s, peak_mem= 5.63 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226181326, tok/s=134.44, tok/s_decode=140.82, ttft=0.0669, mem/s= 807.62 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226181520, tok/s=138.03, tok/s_decode=164.08, ttft=0.2295, mem/s=1390.97 GB/s, peak_mem=22.74 GB, model_size=10.08 GB quant: None, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: 
python generate.py --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226181738, tok/s=192.65, tok/s_decode=205.62, ttft=0.0649, mem/s=1157.32 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226182045, tok/s=192.75, tok/s_decode=206.24, ttft=0.0673, mem/s=1157.27 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226182350, tok/s=236.36, tok/s_decode=257.62, ttft=0.0693, mem/s=1154.19 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226182712, tok/s=231.24, tok/s_decode=250.55, ttft=0.0661, mem/s=1127.37 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226183255, tok/s=169.58, tok/s_decode=179.82, ttft=0.0665, mem/s=1018.74 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226183527, tok/s=184.74, tok/s_decode=196.38, ttft=0.0637, mem/s=1109.18 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226183734, tok/s=232.60, tok/s_decode=252.51, ttft=0.0673, mem/s=1135.85 GB/s, peak_mem= 6.74 GB, 
model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226183953, tok/s=232.47, tok/s_decode=251.15, ttft=0.0635, mem/s=1133.40 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227084325, tok/s=200.72, tok/s_decode=210.91, ttft=0.0475, mem/s=1205.82 GB/s, peak_mem= 8.00 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227084708, tok/s=211.76, tok/s_decode=222.43, ttft=0.0447, mem/s=1271.42 GB/s, peak_mem= 7.99 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227085051, tok/s=241.09, tok/s_decode=255.19, ttft=0.0452, mem/s=1177.31 GB/s, peak_mem= 6.47 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227085446, tok/s=247.53, tok/s_decode=262.94, ttft=0.0468, mem/s=1206.80 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227090411, tok/s=250.11, tok/s_decode=263.99, ttft=0.0416, mem/s=1219.39 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth 
--device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227091144, tok/s=249.14, tok/s_decode=263.74, ttft=0.0439, mem/s=1214.68 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 diff --git a/benchmarks/models/llama/bsr_benchmarks.sh b/benchmarks/models/llama/bsr_benchmarks.sh new file mode 100644 index 0000000000..1ce4da8759 --- /dev/null +++ b/benchmarks/models/llama/bsr_benchmarks.sh @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +# BSR benchmarks +export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B + +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result bsr_bench_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result bsr_bench_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --sparsity semi-structured --precision float16 --write_result bsr_bench_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.8-32 +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.8-64 +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.9-32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.9-64 diff --git a/benchmarks/models/llama/demo_summarize.sh b/benchmarks/models/llama/demo_summarize.sh new file mode 100644 index 0000000000..6a9a11e254 --- /dev/null +++ b/benchmarks/models/llama/demo_summarize.sh @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
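+# NOTE: this script assumes CHECKPOINT_PATH is already set in the environment
+# (the other benchmark scripts export it explicitly); if it is not, set it first, e.g.:
+# export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder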
+# grab moby dick prompt +wget -nc -O moby.txt https://gist.githubusercontent.com/jcaip/f319146bb543e92e23b2c76815b0f29f/raw/31a9cd12b0b59f323eb197c9534953bdac352986/gistfile1.txt + +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B-Instruct + +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8dq_prefill_wo_decode --prefill_size 8192 --max_new_tokens 256 --num_samples 1 --demo_summarize_prompt moby.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8wo --prefill_size 8192 --max_new_tokens 256 --num_samples 1 --demo_summarize_prompt moby.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8dq --prefill_size 8192 --max_new_tokens 256 --num_samples 1 --demo_summarize_prompt moby.txt diff --git a/benchmarks/models/llama/eval.py b/benchmarks/models/llama/eval.py new file mode 100644 index 0000000000..4a67124a08 --- /dev/null +++ b/benchmarks/models/llama/eval.py @@ -0,0 +1,355 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +import time +from pathlib import Path +from typing import List, Optional + +import torch +from generate import ( + _load_model, + device_sync, +) +from tokenizer import get_tokenizer + +import torchao +from torchao._models.llama.model import prepare_inputs_for_model +from torchao.quantization import ( + PerRow, + PerTensor, + float8_dynamic_activation_float8_weight, + float8_weight_only, + fpx_weight_only, + int4_weight_only, + int8_dynamic_activation_int8_weight, + int8_weight_only, + quantize_, + uintx_weight_only, +) +from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, unwrap_tensor_subclass + + +def run_evaluation( + checkpoint_path: Path, + tasks: List[str], + limit: Optional[int] = None, + device="cuda", + precision=torch.bfloat16, + quantization: Optional[str] = None, + sparsity: Optional[str] = None, + compile=False, + max_length=None, + calibration_tasks: Optional[List[str]] = None, + calibration_limit: Optional[int] = None, + calibration_seq_length: Optional[int] = None, + pad_calibration_inputs: Optional[bool] = False, +): + """Runs the evaluation of a model using LM Eval.""" + print( + f"\nEvaluating model {checkpoint_path} on tasks: {tasks}, limit: {limit}, device: {device}, precision: {precision}, " + + f"quantization: {quantization}, sparsity: {sparsity}, compile: {compile}, max_length: {max_length}, calibration_tasks: {calibration_tasks}, " + + f"calibration_seq_length: {calibration_seq_length}, pad_calibration_inputs: {pad_calibration_inputs}\n" + ) + torchao.quantization.utils.recommended_inductor_config_setter() + + assert checkpoint_path.is_file(), checkpoint_path + tokenizer_path = checkpoint_path.parent / "tokenizer.model" + assert tokenizer_path.is_file(), str(tokenizer_path) + # Load Model and Tokenizer + print("Loading model ...") + t0 = time.time() + model = _load_model(checkpoint_path, "cpu", precision) + + if max_length is None: + max_length = model.config.block_size + device_sync(device=device) # MKG + print(f"Time to load model: {time.time() - t0:.02f} seconds") + tokenizer = get_tokenizer(tokenizer_path, checkpoint_path) + + if quantization: + if "spinquant" in quantization: + from torchao.prototype.spinquant import apply_spinquant + + apply_spinquant(model) + if 
"int8wo" in quantization: + quantize_(model, int8_weight_only()) + if "int8dq" in quantization: + quantize_(model, int8_dynamic_activation_int8_weight()) + if "fp6" in quantization: + quantize_(model, fpx_weight_only(3, 2)) + if "int4wo" in quantization and not "gptq" in quantization: + if "hqq" in quantization: + use_hqq = True + else: + use_hqq = False + groupsize = int(quantization.split("-")[1]) + assert ( + groupsize in [32, 64, 128, 256] + ), f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}" + quantize_( + model.to(device), + int4_weight_only(group_size=groupsize, use_hqq=use_hqq), + ) + if "uintx" in quantization: + # uintx-nbits-groupsize + # "uintx-2-64" + if "hqq" in quantization: + use_hqq = True + else: + use_hqq = False + _quant_args = quantization.split("-") + nbits = int(_quant_args[1]) + _NBITS_TO_DTYPE = { + 1: torch.uint1, + 2: torch.uint2, + 3: torch.uint3, + 4: torch.uint4, + 5: torch.uint5, + 6: torch.uint6, + 7: torch.uint7, + 8: torch.uint8, + } + dtype = _NBITS_TO_DTYPE[nbits] + group_size = int(_quant_args[2]) + quantize_(model, uintx_weight_only(dtype, group_size, use_hqq=use_hqq)) + if "marlin" in quantization: + from torchao.dtypes import MarlinSparseLayout + + quantize_(model, int4_weight_only(layout=MarlinSparseLayout())) + if "int4wo" in quantization and "gptq" in quantization: + # avoid circular imports + from torchao._models._eval import MultiTensorInputRecorder + from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer + + groupsize = int(quantization.split("-")[-2]) + assert ( + groupsize in [32, 64, 128, 256] + ), f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}" + assert ( + precision == torch.bfloat16 + ), f"{quantization} requires precision or bfloat16 but got {precision}" + assert "cuda" in device, "int4 gptq quantization only works on cuda" + inputs = ( + MultiTensorInputRecorder( + tokenizer, + calibration_seq_length, + prepare_inputs_for_model, + pad_calibration_inputs, + model.config.vocab_size, + device="cpu", + ) + .record_inputs( + calibration_tasks, + calibration_limit, + ) + .get_inputs() + ) + + quantizer = Int4WeightOnlyGPTQQuantizer(group_size=groupsize, device=device) + model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length) + model = quantizer.quantize(model, inputs).to(device) + else: + if not TORCH_VERSION_AT_LEAST_2_5: + unwrap_tensor_subclass(model) + if "float8wo" in quantization: + quantize_(model, float8_weight_only()) + if "float8dq" in quantization: + granularity = str(quantization.split("-")[-1]) + if granularity == "tensor": + granularity = PerTensor() + elif granularity == "row": + granularity = PerRow() + else: + if granularity == "float8dq": + granularity = PerTensor() + else: + raise ValueError(f"Unknown granularity {granularity}") + quantize_( + model, float8_dynamic_activation_float8_weight(granularity=granularity) + ) + if "autoround" in quantization: + from transformers import AutoTokenizer + + from torchao._models.llama.model import TransformerBlock + from torchao.prototype.autoround.autoround_llm import ( + quantize_model_with_autoround_, + ) + + _tokenizer = AutoTokenizer.from_pretrained(checkpoint_path.parent) + # parse args from quantization string: + # autoround--------- + _quant_args = quantization.split("-") + _default_quant_args = [False, 200, 128, 8, 2048, 128, 1, 0] + _model_devie = _quant_args[1] if len(_quant_args) > 1 else device + _quant_args = _quant_args[2:] + ( + quant_lm_head, + iters, + groupsize, + batch_size, + 
seqlen, + nsamples, + grad_acc_steps, + compile_optimization_process, + ) = [int(x) for x in _quant_args] + _default_quant_args[len(_quant_args) :] + model = model.to(_model_devie) + print( + ( + f"Quantizing model with autoround(iters={iters}, groupsize={groupsize}, " + f"quant_lm_head={quant_lm_head}, batch_size={batch_size}, seqlen={seqlen}, nsamples={nsamples}, " + f"gradient_accumulate_steps={grad_acc_steps}, " + f"compile_optimization_process={compile_optimization_process})" + ) + ) + with torch.device(_model_devie): + model.setup_caches( + max_batch_size=batch_size, max_seq_length=seqlen, training=True + ) + + if quant_lm_head: + is_target_module = ( + lambda mod, fqn: isinstance(mod, TransformerBlock) + or "output" in fqn + ) + else: + is_target_module = lambda mod, fqn: isinstance(mod, TransformerBlock) + quantize_model_with_autoround_( + model=model, + tokenizer=_tokenizer, + is_target_module=is_target_module, + bits=4, + seqlen=seqlen, + batch_size=batch_size, + iters=iters, + nsamples=nsamples, + gradient_accumulate_steps=grad_acc_steps, + compile_optimization_process=compile_optimization_process == 1, + ) + model.to(device) + model.reset_caches() + if "codebook" in quantization: + from torchao.prototype.quantization.codebook import codebook_weight_only + + model.to(device) + quantize_( + model, codebook_weight_only(dtype=torch.uint4, scale_block_size=64) + ) + + if compile: + model = torch.compile(model, mode="max-autotune", fullgraph=True) + with torch.no_grad(): + print("Running evaluation ...") + # avoid circular imports + from torchao._models._eval import TransformerEvalWrapper + + TransformerEvalWrapper( + model=model.to(device), + tokenizer=tokenizer, + max_seq_length=max_length, + input_prep_func=prepare_inputs_for_model, + device=device, + ).run_eval( + tasks=tasks, + limit=limit, + ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Run HF Model Evaluation") + parser.add_argument( + "--checkpoint_path", + type=Path, + default=Path("../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"), + help="Model checkpoint path.", + ) + parser.add_argument( + "--tasks", + nargs="+", + type=str, + default=["wikitext"], + help="List of lm-eluther tasks to evaluate usage: --tasks task1 task2", + ) + parser.add_argument( + "--limit", type=int, default=None, help="Number of eval samples to evaluate" + ) + parser.add_argument( + "--precision", + type=lambda x: getattr(torch, x.split(".")[-1]), + default=torch.bfloat16, + help="dtype precision to use", + ) + parser.add_argument( + "--device", type=str, default="cuda", help="Device to use for evaluation" + ) + parser.add_argument( + "-q", + "--quantization", + type=str, + help=( + "Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-, " + "int4wo--gptq, autoquant, autoquant-int4, int4wo--hqq, " + "uintx--, uintx---hqq, sparse-marlin, spinquant, " + "autoround---------, " + "float8wo, float8dq, float8saq" + ), + ) + parser.add_argument( + "--sparsity", + type=str, + help=("Which sparsity techniques to apply: semi-structured"), + ) + parser.add_argument( + "--compile", action="store_true", help="Whether to compile the model." 
+ ) + parser.add_argument( + "--max_length", + type=int, + default=None, + help="Length of text to process at one time", + ) + parser.add_argument( + "--calibration_tasks", + type=str, + nargs="+", + default=["wikitext"], + help="tasks to do gptq calibration on, if doing gptq", + ) + parser.add_argument( + "--calibration_limit", + type=int, + default=1000, + help="number of samples to use for gptq calibration", + ) + parser.add_argument( + "--calibration_seq_length", + type=int, + default=100, + help="length of sequences to use for gptq calibration", + ) + parser.add_argument( + "--pad_calibration_inputs", + type=bool, + default=False, + help="pads sequences shorter than calibration_seq_length to that length, yielding more calibration inputs but running much slower", + ) + + args = parser.parse_args() + run_evaluation( + args.checkpoint_path, + args.tasks, + args.limit, + args.device, + args.precision, + args.quantization, + args.sparsity, + args.compile, + args.max_length, + args.calibration_tasks, + args.calibration_limit, + args.calibration_seq_length, + args.pad_calibration_inputs, + ) diff --git a/benchmarks/models/llama/evals.sh b/benchmarks/models/llama/evals.sh new file mode 100644 index 0000000000..b4c32644a7 --- /dev/null +++ b/benchmarks/models/llama/evals.sh @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder + +export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround # auto-round w/o quant_lm_head +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head + +export MODEL_REPO=meta-llama/Meta-Llama-3-8B +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu # auto-round w/o quant_lm_head +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head + +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu # auto-round w/o quant_lm_head +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64 +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8wo +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-tensor +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-row + +# Testing on additional tasks +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo --tasks 'winogrande' 'arc_challenge' +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo --tasks 'mmlu' 'truthfulqa_mc2' diff --git a/benchmarks/models/llama/generate.py b/benchmarks/models/llama/generate.py new file mode 100644 index 0000000000..6be6a31e0a --- 
/dev/null +++ b/benchmarks/models/llama/generate.py @@ -0,0 +1,1255 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import Optional, Tuple + +import torch +import torch._dynamo.config +import torch._inductor.config + +import torchao +from torchao._models.utils import ( + get_arch_name, + write_json_result_local, + write_json_result_ossci, +) +from torchao.quantization.quant_primitives import MappingType +from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, get_model_size_in_bytes + +torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False +torch.backends.cuda.enable_cudnn_sdp(True) + + +class HostEvent: + def __init__(self): + self.event_time = None + + def record(self): + self.event_time = time.perf_counter() + + def elapsed_time(self, other_event): + if self.event_time is None: + raise ValueError("Event not recorded!") + # return ms to match cuda event + return abs(other_event.event_time - self.event_time) * 1000 + + +def device_timer(device): + if "cuda" in device: + return torch.cuda.Event(enable_timing=True) + elif ("cpu" in device) or ("mps" in device): + return HostEvent() + else: + print(f"device={device} is not yet supported") + + +def device_sync(device): + if "cuda" in device: + torch.cuda.synchronize(device) + elif "xpu" in device: + torch.xpu.synchronize(device) + elif ("cpu" in device) or ("mps" in device): + pass + else: + print(f"device={device} is not yet supported") + + +default_device = ( + "cuda" + if torch.cuda.is_available() + else "xpu" + if torch.xpu.is_available() + else "cpu" +) + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from torchao._models.llama.model import Transformer, prepare_inputs_for_model +from torchao._models.llama.tokenizer import get_tokenizer + + +def multinomial_sample_one_no_sync( + probs_sort, +): # Does multinomial sampling without a cuda synchronization + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + +def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None): + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None): + probs = logits_to_probs(logits[:, -1], temperature, top_k) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + +def prefill( + model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs +) -> torch.Tensor: + # input_pos: [B, S] + logits = model(x, input_pos) + return sample(logits, **sampling_kwargs)[0] + + +def decode_one_token( + model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs +) -> Tuple[torch.Tensor, torch.Tensor]: + # input_pos: [B, 1] + assert input_pos.shape[-1] == 1 + logits = model(x, input_pos) + return sample(logits, **sampling_kwargs) + + +def decode_n_tokens( + model: Transformer, + cur_token: torch.Tensor, + input_pos: torch.Tensor, + num_new_tokens: int, + callback=lambda _:
_, + **sampling_kwargs, +): + new_tokens, new_probs = [], [] + for i in range(num_new_tokens): + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH): + next_token, next_prob = decode_one_token( + model, cur_token, input_pos, **sampling_kwargs + ) + next_token, next_prob = next_token.clone(), next_prob.clone() + input_pos += 1 + # in some instances not having this causes weird issues with the stored tokens when you run the next decode_one_token step + new_tokens.append(next_token.clone()) + callback(new_tokens[-1]) + new_probs.append(next_prob) + cur_token = next_token + + return new_tokens, new_probs + + +def model_forward(model, x, input_pos): + return model(x, input_pos) + + +@torch.no_grad() +def generate( + model: Transformer, + prompt: torch.Tensor, + max_new_tokens: int, + batch_size: int, + *, + interactive: bool, + callback=lambda x: x, + kv_cache_quantization: bool = False, + cache_size: Optional[int] = None, + linear_causal_mask: bool = False, + prefill_start_event: Optional[torch.cuda.Event] = None, + prefill_end_event: Optional[torch.cuda.Event] = None, + decode_start_event: Optional[torch.cuda.Event] = None, + decode_end_event: Optional[torch.cuda.Event] = None, + **sampling_kwargs, +) -> torch.Tensor: + """ + Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested. + """ + + # create an empty tensor of the expected final shape and fill in the current tokens + device = prompt.device + T = prompt.size(-1) + + # calculate how many tokens to generate based on max_new_tokens and model's upper bound (block_size) + max_seq_length = ( + min(T + max_new_tokens, model.config.block_size) if not interactive else 350 + ) + new_tokens = max_seq_length - T + + # format model input + prompt, input_pos = prepare_inputs_for_model(prompt) + prompt = prompt.repeat(batch_size, 1) # expand prompt based on batchsize + + # full prompt+output will be stored in seq + seq = torch.empty(batch_size, max_seq_length, dtype=prompt.dtype, device=device) + seq[:, :T] = prompt + + # setup model caches + with torch.device(device): + if cache_size is None: + cache_size = max_seq_length + assert ( + cache_size >= max_seq_length + ), "need cache_size to be greater than max_new_tokens + size-of-prompt" + model.setup_caches( + max_batch_size=batch_size, + max_seq_length=cache_size, + kv_cache_quantization=kv_cache_quantization, + linear_causal_mask=linear_causal_mask, + prompt_length=T, + ) + + # execute prefill + if prefill_start_event is not None: + prefill_start_event.record() + next_token = prefill( + model, prompt.view(batch_size, -1), input_pos, **sampling_kwargs + ).clone() + seq[:, T] = next_token.squeeze() + if prefill_end_event is not None: + prefill_end_event.record() + + # execute token generation + if decode_start_event is not None: + decode_start_event.record() + input_pos = torch.tensor([T], device=device, dtype=torch.int) + generated_tokens, _ = decode_n_tokens( + model, + next_token.view(batch_size, -1), + input_pos, + new_tokens - 1, + callback=callback, + **sampling_kwargs, + ) + seq = torch.cat((seq[:, : T + 1], *generated_tokens), dim=-1) + if decode_end_event is not None: + decode_end_event.record() + + return seq + + +def encode_tokens(tokenizer, string, bos=True, device=default_device): + tokens = tokenizer.encode(string) + if bos: + tokens = [tokenizer.bos_id()] + tokens + return torch.tensor(tokens, dtype=torch.int, device=device) + + +def _load_model(checkpoint_path, device, precision): + checkpoint = 
torch.load(str(checkpoint_path), mmap=True, weights_only=True) + if "model" in checkpoint and "stories" in str(checkpoint_path): + checkpoint = checkpoint["model"] + with torch.device("meta"): + model = Transformer.from_name(checkpoint_path.parent.name) + model.load_state_dict(checkpoint, assign=True) + model = model.to(device=device, dtype=precision) + + return model.eval() + + +B_INST, E_INST = "[INST]", "[/INST]" + + +def main( + prefill_size: Optional[int] = None, + prompt: str = "Hello, my name is", + demo_summarize_prompt: Optional[str] = None, + interactive: bool = False, + num_samples: int = 5, + max_new_tokens: int = 100, + batch_size: int = 1, + top_k: int = 200, + temperature: float = 0.8, + checkpoint_path: Path = Path( + "checkpoints/meta-Transformer/Transformer-2-7b-chat-hf/model.pth" + ), + quantization: Optional[str] = None, + min_sqnr: Optional[float] = None, + sparsity: Optional[str] = None, + kv_cache_quantization: bool = False, + cache_size: Optional[int] = None, + linear_causal_mask: bool = False, + save: bool = False, + compile: bool = True, + compile_prefill: bool = False, + profile: Optional[Path] = None, + memory_profile: Optional[Path] = None, + device=default_device, + precision=torch.bfloat16, + write_result: Optional[Path] = None, + output_json_path: Optional[Path] = None, + output_json_local: bool = False, +) -> None: + """Generates text samples based on a pre-trained Transformer model and tokenizer.""" + + if prefill_size is not None and prefill_size > 0: + # create prompt of prefill size + if demo_summarize_prompt is None: + prompt = "prompt " * (int(prefill_size) - 2) + else: + with open(demo_summarize_prompt, "r") as f: + prompt = f.read() + + torchao.quantization.utils.recommended_inductor_config_setter() + + assert checkpoint_path.is_file(), checkpoint_path + tokenizer_path = checkpoint_path.parent / "tokenizer.model" + assert tokenizer_path.is_file(), str(tokenizer_path) + + print(f"Using device={device}") + is_chat = "chat" in str(checkpoint_path) + + print("Loading model ...") + t0 = time.time() + model = _load_model(checkpoint_path, device, precision) + + device_sync(device=device) # MKG + print(f"Time to load model: {time.time() - t0:.02f} seconds") + + tokenizer = get_tokenizer(tokenizer_path, checkpoint_path) + + encoded = encode_tokens(tokenizer, prompt, bos=True, device=device) + + if demo_summarize_prompt is not None: + end_tag = encode_tokens(tokenizer, "\n ", bos=False, device=device) + encoded = encoded[: prefill_size - end_tag.size(0)] + encoded = torch.cat((encoded, end_tag), dim=0) + + prompt_length = encoded.size(0) + + torch.manual_seed(1234) + + def ffn_only(mod, fqn): + return isinstance(mod, torch.nn.Linear) and "feed_forward" in fqn + + def not_ffn_only(mod, fqn): + return isinstance(mod, torch.nn.Linear) and not ffn_only(mod, fqn) + + def ffn_or_attn_only(mod, fqn): + return isinstance(mod, torch.nn.Linear) and ( + "feed_forward" in fqn or "attention" in fqn + ) + + if quantization: + from torchao.quantization import ( + Float8DynamicActivationFloat8SemiSparseWeightConfig, + autoquant, + float8_dynamic_activation_float8_weight, + float8_weight_only, + fpx_weight_only, + gemlite_uintx_weight_only, + int4_dynamic_activation_int4_weight, + int4_weight_only, + int8_dynamic_activation_int4_weight, + int8_dynamic_activation_int8_weight, + int8_weight_only, + quantize_, + uintx_weight_only, + ) + from torchao.quantization.granularity import PerRow, PerTensor + from torchao.utils import unwrap_tensor_subclass + + if "spinquant" in 
quantization: + from torchao.prototype.spinquant import apply_spinquant + + apply_spinquant(model) + if quantization.startswith("gemlite"): + import os + import pwd + + from gemlite.core import GemLiteLinearTriton + + _quant_args = quantization.split("-") + bit_width = int(_quant_args[-2]) + group_size = None if _quant_args[-1] == "None" else int(_quant_args[-1]) + try: + packing_bitwidth = int(_quant_args[-3]) + except: + # if only 2 inputs found, use default value + packing_bitwidth = 32 + + quantize_( + model, + gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth), + ) + + # try to load gemlite kernel config + try: + GemLiteLinearTriton.load_config( + f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + print( + f"loaded gemlite kernel cache /tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + except: + print( + f"unable to load gemlite kernel cache /tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + + print("running gemlite warmup") + generate( + model, + encode_tokens(tokenizer, prompt, bos=True, device=device), + max_new_tokens, + batch_size, + interactive=False, + temperature=temperature, + top_k=top_k, + ) + GemLiteLinearTriton.cache_config( + f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + if "int8wo" in quantization: + quantize_(model, int8_weight_only()) + if "int8dq" in quantization: + if sparsity and "semi" in sparsity: + from torchao.dtypes import SemiSparseLayout + + quantize_( + model, + int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()), + filter_fn=ffn_only, + ) + quantize_( + model, int8_dynamic_activation_int8_weight(), filter_fn=not_ffn_only + ) + elif "int8dq_prefill_wo_decode" in quantization: + quantize_( + model, int8_dynamic_activation_int8_weight(weight_only_decode=True) + ) + else: + quantize_(model, int8_dynamic_activation_int8_weight()) + if "int4wo" in quantization: + use_hqq = False + if "hqq" in quantization: + use_hqq = True + group_size = int(quantization.split("-")[1]) + assert ( + group_size + in [ + 32, + 64, + 128, + 256, + ] + ), f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}" + quantize_(model, int4_weight_only(group_size=group_size, use_hqq=use_hqq)) + elif "int4dq-" in quantization: + from torchao.dtypes import CutlassInt4PackedLayout + + nbits = int(quantization.removeprefix("int4dq-")) + assert nbits == 4 or nbits == 8 + if nbits == 4: + quantize_( + model, + int4_dynamic_activation_int4_weight( + mapping_type=MappingType.SYMMETRIC, + act_mapping_type=MappingType.SYMMETRIC, + layout=CutlassInt4PackedLayout(), + ), + ) + elif nbits == 8: + quantize_( + model, + int8_dynamic_activation_int4_weight( + group_size=None, + mapping_type=MappingType.SYMMETRIC, + act_mapping_type=MappingType.SYMMETRIC, + layout=CutlassInt4PackedLayout(), + ), + ) + if "marlin" in quantization: + if "qqq" in quantization: + from torchao.dtypes import MarlinQQQLayout + + quantize_( + model, + int8_dynamic_activation_int4_weight( + group_size=128, + mapping_type=MappingType.SYMMETRIC, + act_mapping_type=MappingType.SYMMETRIC, + layout=MarlinQQQLayout(), + ), + ) + elif "semi" in sparsity: + from torchao.dtypes import MarlinSparseLayout + + quantize_( + model, + int4_weight_only(layout=MarlinSparseLayout()), + filter_fn=ffn_or_attn_only, + ) + if "fp6" in quantization: + quantize_(model, fpx_weight_only(3, 2)) + elif "embed-int8wo" in quantization: + quantize_( + model, + int8_weight_only(group_size=64), + filter_fn=lambda x, *args: isinstance(x, torch.nn.Embedding), + ) + 
elif quantization.startswith("awq"): + from torchao._models._eval import TransformerEvalWrapper + from torchao.utils import TORCH_VERSION_AT_LEAST_2_3 + + if not TORCH_VERSION_AT_LEAST_2_3: + print("Awq requires torch2.3+") + exit() + from torchao.prototype.awq import ( + AWQObservedLinear, + awq_uintx, + insert_awq_observer_, + ) + + quant_dtype = quantization.split("-")[1] + group_size = int(quantization.split("-")[2]) + quant_dtype = getattr(torch, quant_dtype, torch.uint8) + model = model.to(device) + # get calibration data + insert_awq_observer_( + model, 1, 256, quant_dtype=quant_dtype, group_size=group_size + ) + TransformerEvalWrapper( + model=model.to(device), + tokenizer=tokenizer, + max_seq_length=256, + input_prep_func=prepare_inputs_for_model, + device=device, + ).run_eval( + tasks=["wikitext"], + limit=1, + ) + is_observed_linear = lambda m, fqn: isinstance(m, AWQObservedLinear) + use_hqq = "hqq" in quantization + quantize_( + model, + awq_uintx( + quant_dtype=quant_dtype, group_size=group_size, use_hqq=use_hqq + ), + is_observed_linear, + ) + elif "uintx" in quantization: + # uintx-nbits-group_size, e.g. "uintx-2-64" + if "hqq" in quantization: + # uintx-nbits-group_size-hqq + use_hqq = True + else: + use_hqq = False + _quant_args = quantization.split("-") + nbits = int(_quant_args[1]) + assert nbits >= 1 and nbits <= 8, "nbits must be 1 to 8" + _NBITS_TO_DTYPE = { + 1: torch.uint1, + 2: torch.uint2, + 3: torch.uint3, + 4: torch.uint4, + 5: torch.uint5, + 6: torch.uint6, + 7: torch.uint7, + 8: torch.uint8, + } + dtype = _NBITS_TO_DTYPE[nbits] + group_size = int(_quant_args[2]) + quantize_(model, uintx_weight_only(dtype, group_size, use_hqq=use_hqq)) + elif "int8_dynamic_activation_intx_weight" in quantization: + from torchao.experimental.quant_api import ( + int8_dynamic_activation_intx_weight, + ) + from torchao.quantization.granularity import PerGroup + + assert ( + precision == torch.float32 + ), "int8_dynamic_activation_intx_weight requires using precision=torch.float32" + + # Quantize model + _quant_args = quantization.split("-") + weight_dtype = getattr(torch, f"int{_quant_args[1]}") + granularity = PerGroup(int(_quant_args[2])) + has_weight_zeros = bool(_quant_args[3]) + quantize_( + model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + ), + ) + elif "float8wo" in quantization: + quantize_(model, float8_weight_only()) + elif "float8dq" in quantization: + if sparsity and "semi" in sparsity: + quantize_( + model, + Float8DynamicActivationFloat8SemiSparseWeightConfig(), + filter_fn=ffn_only, + ) + else: + granularity = str(quantization.split("-")[-1]) + if granularity == "tensor": + granularity = PerTensor() + elif granularity == "row": + granularity = PerRow() + else: + granularity = PerTensor() + quantize_( + model, + float8_dynamic_activation_float8_weight(granularity=granularity), + ) + elif "autoquant_v2" in quantization: + from torchao._models._eval import InputRecorder + from torchao._models.llama.model import prepare_inputs_for_model + from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 + + calibration_seq_length = 256 + inputs = ( + InputRecorder( + tokenizer, + calibration_seq_length, + prepare_inputs_for_model, + False, # pad_calibration_inputs + model.config.vocab_size, + device="cuda", + ) + .record_inputs( + ["wikitext"], + 1, + ) + .get_inputs()[0] + .values[0] + ) + inputs = prepare_inputs_for_model(inputs) + with torch.device("cuda"): + 
model.setup_caches( + max_batch_size=1, max_seq_length=calibration_seq_length + ) + + if "autoquant_v2-int4" == quantization: + model = autoquant_v2( + model, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, + example_input=inputs, + batch_size=calibration_seq_length, + ) + elif "autoquant_v2-float8" == quantization: + model = autoquant_v2( + model, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.OTHER_AUTOQUANT_CLASS_LIST, + example_input=inputs, + batch_size=calibration_seq_length, + ) + elif "autoquant_v2-fp" == quantization: + model = autoquant_v2( + model, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, + example_input=inputs, + batch_size=calibration_seq_length, + ) + elif "autoquant_v2-all" == quantization: + all_qtensor_classes = ( + torchao.prototype.quantization.autoquant_v2.DEFAULT_AUTOQUANT_CLASS_LIST + + torchao.prototype.quantization.autoquant_v2.DEFAULT_INT4_AUTOQUANT_CLASS_LIST + + torchao.prototype.quantization.autoquant_v2.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST + ) + if torchao.utils.is_sm_89(): + # this is fp8 related subclasses, should rename + all_qtensor_classes += torchao.prototype.quantization.autoquant_v2.OTHER_AUTOQUANT_CLASS_LIST + model = autoquant_v2( + model, + manual=True, + qtensor_class_list=all_qtensor_classes, + example_input=inputs, + batch_size=calibration_seq_length, + ) + else: + model = autoquant_v2( + model, + manual=True, + example_input=inputs, + batch_size=calibration_seq_length, + ) + + print("running generate") + generate( + model, + encode_tokens(tokenizer, prompt, bos=True, device=device), + max_new_tokens, + batch_size, + interactive=False, + temperature=temperature, + top_k=top_k, + ) + + print("running finalize autoquant") + # do autoquantization + model.finalize_autoquant() + elif "autoquant" in quantization: + from torchao._models._eval import InputRecorder + from torchao._models.llama.model import prepare_inputs_for_model + + calibration_seq_length = 256 + inputs = ( + InputRecorder( + tokenizer, + calibration_seq_length, + prepare_inputs_for_model, + False, # pad_calibration_inputs + model.config.vocab_size, + device="cuda", + ) + .record_inputs( + ["wikitext"], + 1, + ) + .get_inputs()[0] + .values[0] + ) + inputs = prepare_inputs_for_model(inputs) + with torch.device("cuda"): + model.setup_caches( + max_batch_size=1, max_seq_length=calibration_seq_length + ) + + if "autoquant-int4" == quantization: + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-float8" == quantization: + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.OTHER_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-fp" == quantization: + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-sparse" == quantization: + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_SPARSE_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-gemlite-int4" == quantization: + import os + import pwd + + from gemlite.core import GemLiteLinearTriton + + GemLiteLinearTriton.load_config( + 
f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.GEMLITE_INT4_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-all" == quantization: + try: + import os + import pwd + + from gemlite.core import GemLiteLinearTriton + + GemLiteLinearTriton.load_config( + f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + except: + pass + + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.ALL_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + else: + model = autoquant( + model, manual=True, example_input=inputs, min_sqnr=min_sqnr + ) + + generate( + model, + encode_tokens(tokenizer, prompt, bos=True, device=device), + max_new_tokens, + batch_size, + interactive=False, + temperature=temperature, + top_k=top_k, + ) + + # do autoquantization + model.finalize_autoquant() + elif "codebook" in quantization: + from torchao.prototype.quantization.codebook import codebook_weight_only + + model.to(device) + quantize_( + model, codebook_weight_only(dtype=torch.uint4, scale_block_size=64) + ) + + else: + if not TORCH_VERSION_AT_LEAST_2_5: + unwrap_tensor_subclass(model) + + # standalone sparsity + elif sparsity: + from torchao.sparsity import semi_sparse_weight, sparsify_ + + if "semi" in sparsity: + # Fixed sparsity level for 2:4 + sparsify_(model.to(device), semi_sparse_weight(), filter_fn=ffn_only) + + if "bsr" in sparsity: + from torchao.sparsity import SupermaskLinear, block_sparse_weight + + # parse "bsr-0.9-64" + _, sparsity_level, blocksize = sparsity.split("-") + sparsity_level, blocksize = float(sparsity_level), int(blocksize) + sparsify_( + model, + lambda x: SupermaskLinear.from_linear( + x, + sparsity_level=sparsity_level, + blocksize=blocksize, + ), + filter_fn=ffn_only, + ) + print(model) + sparsify_( + model, + SupermaskLinear.to_linear, + filter_fn=ffn_only, + ) + print(model) + + # Accelerate with triton bsr kernels + sparsify_( + model, block_sparse_weight(blocksize=blocksize), filter_fn=ffn_only + ) + + model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9 + + if save: + output_dir = str(checkpoint_path.cwd()) + filename = str(checkpoint_path.name).split(".")[0] + torch.save( + model.state_dict(), + os.path.join(output_dir, filename + f"-{quantization}.pt"), + ) + + if compile: + print("Compiling Model") + global decode_one_token, prefill + decode_one_token = torch.compile( + decode_one_token, + mode="reduce-overhead", + fullgraph=True, + ) + + if compile_prefill: + prefill = torch.compile(prefill, fullgraph=True, dynamic=True) + + if memory_profile: + if device == "cuda": + torch.cuda.memory._record_memory_history( + True, trace_alloc_max_entries=250000, trace_alloc_record_context=True + ) + elif device == "xpu": + torch.xpu.memory._record_memory_history( + True, trace_alloc_max_entries=250000, trace_alloc_record_context=True + ) + else: + print("Memory profiling only works on CUDA or XPU devices") + + aggregate_metrics = { + "tokens_per_sec": [], + "time": [], + "decode_tokens_per_sec": [], + "prefill_time": [], + } + start = -1 if compile else 0 + + for i in range(start, num_samples): + if i == 0: + if device == "cuda": + torch.cuda.reset_peak_memory_stats() # MKG + elif device == "xpu": + torch.xpu.reset_peak_memory_stats() # MKG + device_sync(device=device) # MKG + if i >= 0 and interactive: + prompt = input("What is your prompt? 
") + if is_chat: + prompt = f"{B_INST} {prompt.strip()} {E_INST}" + encoded = encode_tokens(tokenizer, prompt, bos=True, device=device) + + if interactive and i >= 0 and prefill_size is None: + buffer = [] + period_id = tokenizer.encode(".")[0] + done_generating = False + + def callback(x): + nonlocal done_generating + if done_generating: + return + buffer.append(tokenizer.decode([period_id] + x.squeeze(0).tolist())[1:]) + if x.item() == tokenizer.eos_id(): + done_generating = True + if len(buffer) == 4 or done_generating: + print("".join(buffer), end="", flush=True) + buffer.clear() + # print(, end="", flush=True) + + elif demo_summarize_prompt is not None and i >= 0: + buffer = [] + period_id = tokenizer.encode(".")[0] + + def callback(x): + buffer.append(tokenizer.decode([period_id] + x.squeeze(0).tolist())[1:]) + if len(buffer) == 4: + print("".join(buffer), end="", flush=True) + buffer.clear() + else: + callback = lambda x: x + t0 = time.perf_counter() + prefill_start_event, prefill_end_event = ( + device_timer(device), + device_timer(device), + ) + decode_start_event, decode_end_event = ( + device_timer(device), + device_timer(device), + ) + import contextlib + + if i != num_samples - 1 or not profile: + prof = contextlib.nullcontext() + else: + torch.profiler._utils._init_for_cuda_graphs() + prof = torch.profiler.profile() + with prof: + y = generate( + model, + encoded, + max_new_tokens, + batch_size, + interactive=interactive, + callback=callback, + temperature=temperature, + top_k=top_k, + kv_cache_quantization=kv_cache_quantization, + cache_size=cache_size, + linear_causal_mask=linear_causal_mask, + prefill_start_event=prefill_start_event, + prefill_end_event=prefill_end_event, + decode_start_event=decode_start_event, + decode_end_event=decode_end_event, + ) + if i < 0: + print(f"Compilation time: {time.perf_counter() - t0:.2f} seconds") + continue + if hasattr(prof, "export_chrome_trace"): + prof.export_chrome_trace(f"{profile}.json") + device_sync(device=device) # MKG + t = time.perf_counter() - t0 + + if not interactive and demo_summarize_prompt is None and prefill_size is None: + tok_list = y[0].tolist() + # truncate text after end of string token + tokens = ( + tok_list + if tokenizer.eos_id() not in tok_list + else tok_list[: tok_list.index(tokenizer.eos_id())] + ) + print(tokenizer.decode(tokens)) + else: + print("\n") + tokens_generated = y.size(-1) - prompt_length + tokens_sec = tokens_generated / t + aggregate_metrics["tokens_per_sec"].append(tokens_sec) + aggregate_metrics["time"].append(t) + decode_time = decode_start_event.elapsed_time(decode_end_event) / 1000 + decode_tokens_sec = tokens_generated / decode_time + aggregate_metrics["decode_tokens_per_sec"].append(decode_tokens_sec) + prefill_time = prefill_start_event.elapsed_time(prefill_end_event) / 1000 + aggregate_metrics["prefill_time"].append(prefill_time) + print( + f"Sample {i+1} | overall time {t:.04f} s {tokens_sec:.02f} tokens/sec", + f"| prefill time {prefill_time:.04f} s decode {decode_tokens_sec:.02f} tokens/sec", + ) + print(f"Bandwidth achieved: {model_size * tokens_sec:.02f} GB/s") + + if memory_profile and i == 0: + if device == "cuda": + snapshot = torch.cuda.memory._snapshot() + elif device == "xpu": + snapshot = torch.xpu.memory._snapshot() + else: + print("Memory profiling only works on CUDA or XPU devices") + + with open(f"{memory_profile}.pickle", "wb") as f: + from pickle import dump + + dump(snapshot, f) + print( + f"\nmemory profile {memory_profile}.pickle saved, to convert that to a 
usable file, use", + "python pytorch/torch/cuda/_memory_viz.py trace_plot -o .html", + ) + break + print("==========") + + # ignore first sample for warmup + tokpersec = torch.mean(torch.tensor(aggregate_metrics["tokens_per_sec"])).item() + ttft = torch.mean(torch.tensor(aggregate_metrics["prefill_time"])).item() + decode_tokpersec = torch.mean( + torch.tensor(aggregate_metrics["decode_tokens_per_sec"]) + ).item() + bandwidth = model_size * tokpersec + mem = torch.cuda.max_memory_reserved() / 1e9 + print(f"Average overall tokens/sec: {tokpersec:.2f}") + print(f"Average decode tokens/sec: {decode_tokpersec:.04f} s") + print(f"Average TTFT: {ttft:.04f} s") + if device == "cuda": + mem = torch.cuda.max_memory_reserved() / 1e9 + elif device == "xpu": + mem = torch.xpu.max_memory_reserved() / 1e9 + print(f"Average tokens/sec: {tokpersec:.2f}") + if batch_size > 1: + print(f"Average tokens/sec including batches {batch_size*tokpersec:.2f}") + print(f"Average Bandwidth: {bandwidth:.02f} GB/s") + print(f"Peak Memory Usage: {mem:.02f} GB") + print(f"Model Size: {model_size:.02f} GB") + if write_result: + result_txt = f"\n{datetime.today().strftime('%Y%m%d%H%M%S')}, tok/s={tokpersec:6.2f}, tok/s_decode={decode_tokpersec:6.2f}, ttft={ttft:5.4f}, mem/s={bandwidth:7.2f} GB/s, peak_mem={mem:5.2f} GB, model_size={model_size:5.2f} GB " + result_txt += f"quant: {quantization}, sparse: {sparsity}, mod: {checkpoint_path.parent.name}, kv_quant: {kv_cache_quantization}, compile: {compile}, compile_prefill: {compile_prefill}, dtype: {precision}, device: {device} " + result_txt += "repro: python generate.py " + result_txt += f"--quantization {quantization} " if quantization else "" + result_txt += f"--sparsity {sparsity} " if sparsity else "" + result_txt += f"--checkpoint_path {checkpoint_path} " + result_txt += f"--device {device} " + result_txt += f"--precision {precision} " + result_txt += "--compile " if compile else "" + result_txt += "--compile_prefill " if compile_prefill else "" + result_txt += f"--prefill_size {prefill_size}" if prefill_size else "" + result_txt += f"--profile {profile} " if profile else "" + result_txt += f"--profile {memory_profile} " if memory_profile else "" + result_txt += "--interactive " if interactive else "" + result_txt += f"--num_samples {num_samples} " + result_txt += f"--max_new_tokens {max_new_tokens} " + result_txt += f"--batch_size {batch_size} " + result_txt += f"--top_k {top_k} " + result_txt += f"--temperature {temperature} " + result_txt += f"--cache_size {cache_size}" if cache_size else "" + result_txt += "--kv_cache_quantization " if kv_cache_quantization else "" + result_txt += "--linear_causal_mask " if linear_causal_mask else "" + + f = open(write_result, "a") + f.write(result_txt) + f.close() + + if output_json_path: + headers = [ + "name", + "dtype", + "min_sqnr", + "compile", + "device", + "arch", + "metric", + "actual", + "target", + ] + name = checkpoint_path.parent.name + arch = get_arch_name() + dtype = quantization or "noquant" + memory_result = [ + name, + dtype, + min_sqnr, + compile, + device, + arch, + "mem/s", + bandwidth, + None, + ] + performance_result = [ + name, + dtype, + min_sqnr, + compile, + device, + arch, + "tok/s", + tokpersec, + None, + ] + write_json_result = ( + write_json_result_local if output_json_local else write_json_result_ossci + ) + write_json_result(output_json_path, headers, memory_result) + write_json_result(output_json_path, headers, performance_result) + + +if __name__ == "__main__": + import argparse + + parser = 
argparse.ArgumentParser(description="Generate text with a pre-trained Llama model and benchmark performance.") + parser.add_argument( + "--prefill_size", type=int, default=None, help="Prompt (prefill) size in tokens, used for TTFT-style benchmarking" + ) + parser.add_argument( + "--prompt", type=str, default="Hello, my name is", help="Input prompt." + ) + parser.add_argument( + "--demo_summarize_prompt", type=str, help="Read prompt from text file" + ) + parser.add_argument( + "--interactive", + action="store_true", + help="Whether to launch in interactive mode", + ) + parser.add_argument("--num_samples", type=int, default=5, help="Number of samples.") + parser.add_argument( + "--max_new_tokens", type=int, default=200, help="Maximum number of new tokens." + ) + parser.add_argument( + "--batch_size", type=int, default=1, help="Batch size to benchmark with" + ) + parser.add_argument("--top_k", type=int, default=200, help="Top-k for sampling.") + parser.add_argument( + "--temperature", type=float, default=0.8, help="Temperature for sampling." + ) + parser.add_argument( + "--checkpoint_path", + type=Path, + default=Path("../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"), + help="Model checkpoint path.", + ) + parser.add_argument( + "-q", + "--quantization", + type=str, + help=( + "Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-hqq, autoquant, " + + "autoquant-int4, autoquant-gemlite-int4, autoquant-float8, autoquant-sparse, autoquant-all, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin, spinquant, " + + "embed-int8wo, marlin_qqq, gemlite-<packing_bitwidth>-<nbits>-<groupsize>, float8dq, int4dq-<nbits>" + ), + ) + parser.add_argument( + "--min_sqnr", + type=float, + default=None, + help=( + "min sqnr for quantizing vs. not quantizing a layer, used in autoquant options" + ), + ) + parser.add_argument( + "-s", + "--sparsity", + type=str, + help=("Which sparsity techniques to apply: semi-structured, bsr-<sparsity_level>-<blocksize>"), + ) + parser.add_argument( + "--kv_cache_quantization", + action="store_true", + help="Whether to quantize the KV cache", + ) + parser.add_argument( + "--cache_size", + type=int, + default=None, + help="Force size of cache to be a certain number of tokens, if not set, will use max_new_tokens+prompt_size", + ) + parser.add_argument( + "--linear_causal_mask", + action="store_true", + help="Whether to use the memory efficient, but slightly less fast, linear causal mask (important for long context lengths)", + ) + parser.add_argument( + "--save", action="store_true", help="Whether to save the quantized model." + ) + parser.add_argument( + "--compile", action="store_true", help="Whether to compile the model." + ) + parser.add_argument( + "--compile_prefill", + action="store_true", + help="Whether to compile the prefill (improves prefill perf, but higher compile times)", + ) + parser.add_argument("--profile", type=Path, default=None, help="Profile path.") + parser.add_argument( + "--memory_profile", type=Path, default=None, help="filename for memory profile."
+ ) + parser.add_argument( + "--device", type=str, default=default_device, help="Device to use" + ) + parser.add_argument( + "--precision", + type=lambda x: getattr(torch, x.split(".")[-1]), + default=torch.bfloat16, + help="dtype precision to use", + ) + parser.add_argument( + "--write_result", type=Path, default=None, help="Path where to write the result" + ) + parser.add_argument( + "--output_json_path", + type=Path, + default=None, + help="Path where to write the json result for dashboard", + ) + parser.add_argument( + "--output_json_local", + action="store_true", + help="Whether to output json result for local machine or for CI machine, local option will fill in some dummy fields", + ) + + args = parser.parse_args() + print(args) + main( + args.prefill_size, + args.prompt, + args.demo_summarize_prompt, + args.interactive, + args.num_samples, + args.max_new_tokens, + args.batch_size, + args.top_k, + args.temperature, + args.checkpoint_path, + args.quantization, + args.min_sqnr, + args.sparsity, + args.kv_cache_quantization, + args.cache_size, + args.linear_causal_mask, + args.save, + args.compile, + args.compile_prefill, + args.profile, + args.memory_profile, + args.device, + args.precision, + args.write_result, + args.output_json_path, + args.output_json_local, + ) diff --git a/benchmarks/models/sam/benchmark.sh b/benchmarks/models/sam/benchmark.sh new file mode 100755 index 0000000000..e446f62cbc --- /dev/null +++ b/benchmarks/models/sam/benchmark.sh @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +# baseline +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --print_header True +# int8 dynamic quant (all) +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --compress int8_dynamic_quant +# 2:4 sparsity (all) +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --compress sparse_mlp_only +# 2:4 sparsity (mlp only) +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --compress sparse +# int8 dynamic quant + 2:4 sparsity (attn: int8, mlp lin1: int8+2:4 fuse mul, mlp lin2: 2:4 sparse) +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir 
tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --compress int8_dynamic_quant_sparse +# int8 dynamic quant attn + int4 wo + sparse marlin lin 1 + 2:4 sparse lin2 +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half float16 --device cuda --compress int4_weight_only_sparse diff --git a/benchmarks/models/sam/eval_combo.py b/benchmarks/models/sam/eval_combo.py new file mode 100644 index 0000000000..20f7f3ed2c --- /dev/null +++ b/benchmarks/models/sam/eval_combo.py @@ -0,0 +1,692 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +import math +import resource +import time + +import fire +import torch +import tqdm +from data import build_data, setup_coco_img_ids +from metrics import calculate_miou, create_result_entry + +import torchao +from torchao._models.utils import ( + get_arch_name, + write_json_result_local, + write_json_result_ossci, +) +from torchao.dtypes import SemiSparseLayout +from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 +from torchao.quantization import ( + autoquant, + int4_weight_only, + int8_dynamic_activation_int8_weight, + quantize_, +) +from torchao.sparsity import apply_fake_sparsity, semi_sparse_weight, sparsify_ +from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, unwrap_tensor_subclass + +torch._dynamo.config.cache_size_limit = 50000 + + +def unbind_jagged(device, data, sizes, offsets): + if data is None: + return None + data = data.to(device=device, non_blocking=True) + return [ + data[offsets[batch_idx] : offsets[batch_idx + 1]].view(sizes[batch_idx]) + for batch_idx in range(len(sizes)) + ] + + +PADDED_TENSOR = None + + +# Preallocate a "landing" Tensor for incoming data and reuse it across launches. 
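+# Padding the last, smaller batch up to batch_size keeps the image encoder's input
+# shape static, so a torch.compile'd encoder does not need to recompile for the final
+# partial batch; get_features_batch slices the padded rows back off afterwards. The
+# buffer is filled with a non_blocking copy, so the host-to-device transfer can
+# overlap with other work when the source tensor is pinned.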
+def pad_to_batch_size(batch, batch_size, device): + assert batch.dim() == 4 + # assert batch.is_pinned() + global PADDED_TENSOR + if PADDED_TENSOR is None: + batch = batch.to(device=device, non_blocking=True) + full_batch_size = (batch_size, batch.size(1), batch.size(2), batch.size(3)) + first_entry = batch[0].unsqueeze(0) + repeat_first_entry = first_entry.expand(full_batch_size) + padded_batch = torch.cat( + [batch, repeat_first_entry[batch.size(0) : batch_size]], dim=0 + ) + assert padded_batch.size() == full_batch_size + PADDED_TENSOR = padded_batch + PADDED_TENSOR[: batch.size(0)].copy_(batch, non_blocking=True) + return PADDED_TENSOR + + +def get_features_batch( + encoder, input_image_batch, pad_input_image_batch, batch_size, device +): + if pad_input_image_batch: + features_batch = encoder( + pad_to_batch_size(input_image_batch, batch_size, device) + ) + return features_batch[: input_image_batch.size(0)] + return encoder(input_image_batch) + + +def build_results_batch(predictor, batch, batch_size, pad_input_image_batch): + encoder = predictor.model.image_encoder + device = predictor.device + + input_image_batch = batch[0] + # The number of valid data points varies slightly per batch + orig_input_image_batch_size = input_image_batch.size(0) + if input_image_batch is None: + return (None, None, None) + + with torch.autograd.profiler.record_function("data transfer"): + coords_lists = unbind_jagged(*([device] + batch[1:4])) + gt_masks_lists = unbind_jagged(*([device] + batch[4:7])) + if coords_lists is None: + return (None, None, None) + datapoints = list(zip(*(batch[7:] + [coords_lists, gt_masks_lists]))) + if pad_input_image_batch: + # Pad to a static shape to avoid recompilation + input_image_batch = pad_to_batch_size(input_image_batch, batch_size, device) + else: + input_image_batch = input_image_batch.to(device=device, non_blocking=True) + + # We explicitly exclude data transfers from the timing to focus + # only on the kernel performance. + # Next we synchronize and set two events to start timing. + if torch.cuda.is_available(): + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + else: + t0 = time.time() + + with torch.autograd.profiler.record_function("timed region"): + with torch.autograd.profiler.record_function("image encoder"): + features_batch = encoder(input_image_batch) + features_batch = features_batch[:orig_input_image_batch_size] + + with torch.autograd.profiler.record_function("predict_torch"): + result_batch = [] + for batch_idx, ( + anns, + image, + input_size, + idx, + coords, + gt_masks, + ) in enumerate(datapoints): + features = features_batch.narrow(0, batch_idx, 1) + predictor.reset_image() + predictor.original_size = image.shape[:2] + predictor.input_size = input_size + predictor.features = features + predictor.is_image_set = True + coords = coords.unsqueeze(1) + fg_labels = torch.ones( + (coords.size(0), 1), dtype=torch.int, device=device + ) + masks, scores, logits = predictor.predict_torch( + point_coords=coords, + point_labels=fg_labels, + multimask_output=True, + ) + entry = create_result_entry(anns, gt_masks, masks, scores, idx) + result_batch += entry + + # After all kernels have been launched we synchronize again and measure + # the amount of time spent on the GPU. This is a fairly tight measurement + # around the launched GPU kernels and excludes data movement from host + # to device. 
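+ # Note that CUDA events report elapsed time in milliseconds, while the
+ # time.time() fallback below yields seconds; the avg_ms_per_img / img_s math
+ # downstream assumes milliseconds, so the CPU-only path reports seconds where
+ # milliseconds are expected.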
+ if torch.cuda.is_available(): + end_event.record() + torch.cuda.synchronize() + elapsed_time = start_event.elapsed_time(end_event) + else: + elapsed_time = time.time() - t0 + return result_batch, orig_input_image_batch_size, elapsed_time + + +def build_results( + batched_data_iter, + predictor, + mask_debug_out_dir, + batch_size, + use_compile, + use_compile_decoder, + pad_input_image_batch, + compress, + use_fullgraph=False, +): + # TODO: Re-enable this for datapoints + assert not use_compile_decoder + + batch_runner = build_results_batch + + results = [] + batch_idx = 0 + num_images = 0 + num_batches = 0 + elapsed_time = 0 + partial_batch = False + for batch in tqdm.tqdm(batched_data_iter): + with torch.no_grad(): + if batch_idx == 0: + with torch.autograd.profiler.record_function("compilation and warmup"): + if str(use_compile) != "False": + predictor.model.image_encoder = torch.compile( + predictor.model.image_encoder, + mode=use_compile, + fullgraph=use_fullgraph, + ) + # Run first batch a few times for warmup and exclude it from the final timings + for _ in range(5): + _ = batch_runner( + predictor, batch, batch_size, pad_input_image_batch + ) + result_batch, num_datapoints, kernel_time = batch_runner( + predictor, batch, batch_size, pad_input_image_batch + ) + if result_batch is not None: + results += result_batch + # We expect a partial batch to only happens once at the end + assert not partial_batch + # Only measure timing on full batches + if num_datapoints == batch_size: + num_images += num_datapoints + num_batches += 1 + # We consistently exclude the last (512 - filtered) images + # Since batch sizes must be powers of two and less than + # or equal 512 this ensures consistent timing across varying + # batch sizes. + if num_images <= 4488: + elapsed_time += kernel_time + else: + partial_batch = True + batch_idx += 1 + + avg_ms_per_img = None + if num_images > 0: + avg_ms_per_img = elapsed_time + avg_ms_per_img = avg_ms_per_img / num_images + + return results, avg_ms_per_img, num_batches, num_images + + +def identity_runner(fn, *args, **kwargs): + return fn(*args, **kwargs) + + +def profiler_runner(path, fn, *args, **kwargs): + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + result = fn(*args, **kwargs) + prof.export_chrome_trace(path) + return result + + +def profile_top_runner(fn, *args, **kwargs): + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + result = fn(*args, **kwargs) + if torch.cuda.is_available(): + print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)) + else: + print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) + return result + + +def memory_runner(path, fn, *args, **kwargs): + print("Start memory recording") + torch.cuda.synchronize() + torch.cuda.memory._record_memory_history( + True, trace_alloc_max_entries=100000, trace_alloc_record_context=True + ) + result = fn(*args, **kwargs) + torch.cuda.synchronize() + snapshot = torch.cuda.memory._snapshot() + print("Finish memory recording") + import pickle + + with open(path, "wb") as f: + pickle.dump(snapshot, f) + # Use to convert pickle file into html + # python torch/cuda/_memory_viz.py trace_plot .pickle -o .html + return result + + +def run( + coco_root_dir, + coco_slice_name, + sam_checkpoint_base_path, + sam_model_type, + 
point_sampling_cache_dir, + mask_debug_out_dir, + batch_size=1, + print_header=False, + coco_category_names=None, + limit=None, + img_id=None, + use_half=None, + use_compile="False", + use_compile_decoder=False, + compress=None, + min_sqnr=None, + num_workers=0, + use_rel_pos=True, + pad_input_image_batch=True, + profile_path=None, + profile_top=False, + memory_path=None, + device="cuda", + output_json_path=None, + output_json_local=False, +): + from torch._inductor import config as inductorconfig + + inductorconfig.triton.unique_kernel_names = True + inductorconfig.epilogue_fusion = True + inductorconfig.coordinate_descent_tuning = True + inductorconfig.coordinate_descent_check_all_directions = True + inductorconfig.force_fuse_int_mm_with_mul = True + inductorconfig.use_mixed_mm = True + from torch.sparse import SparseSemiStructuredTensor + + SparseSemiStructuredTensor._FORCE_CUTLASS = False + + if use_half is not None: + if use_half == "float16": + use_half = torch.float16 + elif use_half == "bfloat16": + use_half = torch.bfloat16 + else: + raise ValueError( + "Expected one of float16 or bfloat for specified {use_half}" + ) + + # Batch size needs to be a multiple of two and at most 512. + assert math.log2(batch_size).is_integer() + assert batch_size <= 512 + + # https://github.com/facebookresearch/segment-anything/tree/main#model-checkpoints + # largest to smallest: vit_h, vit_l, vit_b + model_type_to_checkpoint = { + "vit_h": f"{sam_checkpoint_base_path}/sam_vit_h_4b8939.pth", + "vit_l": f"{sam_checkpoint_base_path}/sam_vit_l_0b3195.pth", + "vit_b": f"{sam_checkpoint_base_path}/sam_vit_b_01ec64.pth", + } + + from segment_anything_fast import SamPredictor, sam_model_registry + + checkpoint_path = model_type_to_checkpoint[sam_model_type] + sam = sam_model_registry[sam_model_type](checkpoint=checkpoint_path).to( + torch.device(device) + ) + predictor = SamPredictor(sam) + + from segment_anything_fast import tools + + tools.apply_eval_dtype_predictor(predictor, use_half) + + for block in predictor.model.image_encoder.blocks: + block.attn.use_rel_pos = use_rel_pos + + # Helper filter functions + def attn_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "attn" in name + + def mlp_lin1_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "lin1" in name + + def mlp_lin2_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "lin2" in name + + def mlp_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "mlp" in name + + if compress == "int8_dynamic_quant": + quantize_(predictor.model.image_encoder, int8_dynamic_activation_int8_weight()) + if not TORCH_VERSION_AT_LEAST_2_5: + predictor.model.image_encoder = unwrap_tensor_subclass( + predictor.model.image_encoder + ) + elif compress == "sparse_mlp_only": + + def mlp_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "mlp" in name + + apply_fake_sparsity(predictor.model.image_encoder, filter_fn=mlp_only) + sparsify_( + predictor.model.image_encoder, semi_sparse_weight(), filter_fn=mlp_only + ) + elif compress == "sparse": + apply_fake_sparsity(predictor.model.image_encoder) + sparsify_(predictor.model.image_encoder, semi_sparse_weight()) + elif compress == "int8_dynamic_quant_sparse": + # apply sparsify first to set qparams + apply_fake_sparsity(predictor.model.image_encoder, filter_fn=mlp_only) + + quantize_( + predictor.model.image_encoder, + int8_dynamic_activation_int8_weight(), + attn_only, + ) + quantize_( + predictor.model.image_encoder, + 
int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()), + mlp_lin1_only, + ) + sparsify_(predictor.model.image_encoder, semi_sparse_weight(), mlp_lin2_only) + if not TORCH_VERSION_AT_LEAST_2_5: + predictor.model.image_encoder = unwrap_tensor_subclass( + predictor.model.image_encoder + ) + elif compress == "int4_weight_only_sparse": + # apply sparsify first to set qparams + apply_fake_sparsity(predictor.model.image_encoder, filter_fn=mlp_only) + from torchao.dtypes import MarlinSparseLayout + + quantize_( + predictor.model.image_encoder, + int8_dynamic_activation_int8_weight(), + attn_only, + ) + quantize_( + predictor.model.image_encoder, + int4_weight_only(layout=MarlinSparseLayout()), + mlp_lin1_only, + ) + sparsify_(predictor.model.image_encoder, semi_sparse_weight(), mlp_lin2_only) + if not TORCH_VERSION_AT_LEAST_2_5: + predictor.model.image_encoder = unwrap_tensor_subclass( + predictor.model.image_encoder + ) + + elif compress is not None and "autoquant_v2" in compress: + example_input = torch.randn( + 1, 3, 1024, 1024, dtype=torch.bfloat16, device=device + ) + if "autoquant_v2-int4" == compress: + autoquant_v2( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, + ) + elif "autoquant_v2-float8" == compress: + autoquant_v2( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.OTHER_AUTOQUANT_CLASS_LIST, + ) + elif "autoquant_v2-all" == compress: + autoquant_v2( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.ALL_AUTOQUANT_CLASS_LIST, + ) + else: + autoquant_v2( + predictor.model.image_encoder, example_input=example_input, manual=True + ) + + predictor.model.image_encoder(example_input) + predictor.model.image_encoder.finalize_autoquant() + + elif compress is not None and "autoquant" in compress: + example_input = torch.randn( + 1, 3, 1024, 1024, dtype=torch.bfloat16, device=device + ) + if "autoquant-int4" == compress: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, + min_sqnr=min_sqnr, + ) + elif "autoquant-float8" == compress: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.quantization.OTHER_AUTOQUANT_CLASS_LIST, + min_sqnr=min_sqnr, + ) + elif "autoquant-sparse" == compress: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_SPARSE_AUTOQUANT_CLASS_LIST, + min_sqnr=min_sqnr, + ) + elif "autoquant-all" == compress: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.quantization.ALL_AUTOQUANT_CLASS_LIST, + min_sqnr=min_sqnr, + ) + else: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + min_sqnr=min_sqnr, + ) + predictor.model.image_encoder(example_input) + predictor.model.image_encoder.finalize_autoquant() + else: + assert compress is None, f"Unsupported compress mode {compress}" + + coco_img_ids_, cat_id_to_cat, catIds, coco = setup_coco_img_ids( + coco_root_dir, coco_slice_name, coco_category_names, img_id + ) + + coco_img_ids = [] + for imgId in coco_img_ids_: + img = 
coco.loadImgs(imgId)[0] + annIds = coco.getAnnIds(imgIds=img["id"], catIds=catIds, iscrowd=None) + anns = coco.loadAnns(annIds) + if len(anns) != 0: + coco_img_ids.append(imgId) + + build_batch = build_data( + coco_img_ids, + coco, + catIds, + coco_root_dir, + coco_slice_name, + point_sampling_cache_dir, + predictor, + use_half, + pad_input_image_batch, + ) + + limit = len(coco_img_ids) if limit is None else limit + batched_data_iter = torch.utils.data.DataLoader( + list(range(limit)), + batch_size=batch_size, + collate_fn=build_batch, + num_workers=num_workers, + pin_memory=False, + ) + runner = identity_runner + + if profile_path is not None: + import functools + + runner = functools.partial(profiler_runner, profile_path) + + if profile_top: + runner = profile_top_runner + + if memory_path is not None: + assert ( + use_compile != "max-autotune" + ), f"Memory path does not support {use_compile}" + import functools + + runner = functools.partial(memory_runner, memory_path) + + results, avg_ms_per_img, num_batches, num_images = runner( + build_results, + batched_data_iter, + predictor, + mask_debug_out_dir, + batch_size, + use_compile, + use_compile_decoder, + pad_input_image_batch, + compress, + ) + + results = [[r[0], r[1], r[2], r[3].item()] for r in results] + + img_s, batch_ms_batch_size = None, None + if avg_ms_per_img is not None: + img_s = 1000 / avg_ms_per_img + batch_ms_batch_size = (avg_ms_per_img * num_images) / num_batches / batch_size + + mIoU = calculate_miou(results, mask_debug_out_dir, True, cat_id_to_cat) + if torch.cuda.is_available(): + max_memory_allocated_bytes = torch.cuda.max_memory_allocated() + _, total_memory = torch.cuda.mem_get_info() + max_memory_allocated_percentage = int( + 100 * (max_memory_allocated_bytes / total_memory) + ) + max_memory_allocated_bytes = max_memory_allocated_bytes >> 20 + else: + import psutil + + total_memory = psutil.virtual_memory().total + max_memory_allocated_bytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + max_memory_allocated_percentage = int( + 100 * (max_memory_allocated_bytes / (total_memory >> 10)) + ) + max_memory_allocated_bytes = max_memory_allocated_bytes >> 10 + + with open("results.csv", "a") as f: + if print_header: + header = ",".join( + [ + "device", + "sam_model_type", + "batch_size", + "memory(MiB)", + "memory(%)", + "img_s(avg)", + "batch_ms(avg)/batch_size", + "mIoU", + "use_compile", + "use_half", + "compress", + "use_compile_decoder", + "use_rel_pos", + "pad_input_image_batch", + "num_workers", + "num_batches", + "num_images", + "profile_path", + "memory_path", + ] + ) + f.write(header + "\n") + vals = ",".join( + map( + str, + [ + device, + sam_model_type, + batch_size, + max_memory_allocated_bytes, + max_memory_allocated_percentage, + img_s, + batch_ms_batch_size, + mIoU, + use_compile, + use_half, + compress, + use_compile_decoder, + use_rel_pos, + pad_input_image_batch, + num_workers, + num_batches, + num_images, + profile_path, + memory_path, + ], + ) + ) + f.write(vals + "\n") + + if output_json_path: + headers = [ + "name", + "dtype", + "min_sqnr", + "compile", + "device", + "arch", + "metric", + "actual", + "target", + ] + name = sam_model_type + arch = get_arch_name() + dtype = compress or "noquant" + # boolean flag to indicate whether compile is used + compile = use_compile != "False" + memory_result = [ + name, + dtype, + min_sqnr, + compile, + device, + arch, + "memory(MiB)", + max_memory_allocated_bytes, + None, + ] + performance_result = [ + name, + dtype, + min_sqnr, + compile, + device, 
+ arch,
+ "img_s(avg)",
+ img_s,
+ None,
+ ]
+ write_json_result = (
+ write_json_result_local if output_json_local else write_json_result_ossci
+ )
+ write_json_result(output_json_path, headers, memory_result)
+ write_json_result(output_json_path, headers, performance_result)
+
+
+if __name__ == "__main__":
+ fire.Fire(run)
diff --git a/benchmarks/models/sam/flash_4_configs.p b/benchmarks/models/sam/flash_4_configs.p
new file mode 100644
index 0000000000000000000000000000000000000000..4b6e234d0d8dc9753a04c57ae001b4cedd232e1f
GIT binary patch
literal 219
zcmZo*nR0t@ZtV*3Sd5VU&g13M#gMfEK$&}V9K~plAsUxD{
diff --git a/benchmarks/models/utils.py b/benchmarks/models/utils.py
new file mode 100644
--- /dev/null
+++ b/benchmarks/models/utils.py
@@ -0,0 +1,111 @@
+import datetime
+import hashlib
+import json
+import os
+import platform
+import time
+
+import torch
+
+
+def get_arch_name() -> str:
+ if torch.cuda.is_available():
+ return torch.cuda.get_device_name()
+ else:
+ # This returns x86_64 or arm64 (for aarch64)
+ return platform.machine()
+
+
+def write_json_result_ossci(output_json_path, headers, row):
+ """
+ Write the result into JSON format, so that it can be uploaded to the benchmark database
+ to be displayed on OSS dashboard. The JSON format is defined at
+ https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+
+ OSS CI version, that will leave many fields to be filled in by CI
+ """
+ mapping_headers = {headers[i]: v for i, v in enumerate(row)}
+ record = {
+ "benchmark": {
+ "name": "TorchAO benchmark",
+ "mode": "inference",
+ "dtype": mapping_headers["dtype"],
+ "extra_info": {
+ "device": mapping_headers["device"],
+ "arch": mapping_headers["arch"],
+ "min_sqnr": mapping_headers["min_sqnr"],
+ # True means compile is enabled, False means eager mode
+ "compile": mapping_headers["compile"],
+ },
+ },
+ "model": {
+ "name": mapping_headers["name"],
+ "type": "model",
+ "origins": ["torchao"],
+ },
+ "metric": {
+ "name": mapping_headers["metric"],
+ "benchmark_values": [mapping_headers["actual"]],
+ "target_value": mapping_headers["target"],
+ },
+ }
+
+ with open(f"{os.path.splitext(output_json_path)[0]}.json", "a") as f:
+ print(json.dumps(record), file=f)
+
+
+def write_json_result_local(output_json_path, headers, row):
+ """
+ Write the result into JSON format, so that it can be uploaded to the benchmark database
+ to be displayed on OSS dashboard. 
The JSON format is defined at + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + + Local version (filling in dummy values for fields that should be populated by CI) + """ + mapping_headers = {headers[i]: v for i, v in enumerate(row)} + today = datetime.date.today() + sha_hash = hashlib.sha256(str(today).encode("utf-8")).hexdigest() + first_second = datetime.datetime.combine(today, datetime.time.min) + workflow_id = int(first_second.timestamp()) + job_id = workflow_id + 1 + record = { + "timestamp": int(time.time()), + "schema_version": "v3", + "name": "devvm local benchmark", + "repo": "pytorch/ao", + "head_branch": "main", + "head_sha": sha_hash, + "workflow_id": workflow_id, + "run_attempt": 1, + "job_id": job_id, + "benchmark": { + "name": "TorchAO benchmark", + "mode": "inference", + "dtype": mapping_headers["dtype"], + "extra_info": { + "device": mapping_headers["device"], + "arch": mapping_headers["arch"], + "min_sqnr": mapping_headers["min_sqnr"], + # True means compile is enabled, False means eager mode + "compile": mapping_headers["compile"], + }, + }, + "model": { + "name": mapping_headers["name"], + "type": "model", + "origins": ["torchao"], + }, + "metric": { + "name": mapping_headers["metric"], + "benchmark_values": [mapping_headers["actual"]], + "target_value": mapping_headers["target"], + }, + } + + with open(f"{os.path.splitext(output_json_path)[0]}.json", "a") as f: + print(json.dumps(record), file=f)
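
Both writers append one record per line to the file named after `output_json_path` (with its extension replaced by `.json`), so the output is JSON Lines rather than a single JSON array. A minimal sketch for reading such a file back during a local sanity check; the `results.json` filename here is only an illustrative assumption, not something this patch produces:

    import json

    # Each line is one record appended by write_json_result_local / write_json_result_ossci.
    with open("results.json") as f:
        records = [json.loads(line) for line in f if line.strip()]

    for rec in records:
        metric = rec["metric"]
        print(rec["model"]["name"], metric["name"], metric["benchmark_values"], metric["target_value"])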