Move model benchmarking evaluation scripts to benchmarks folder #2016

Draft · wants to merge 1 commit into main
33 changes: 33 additions & 0 deletions benchmarks/models/llama/README.md
@@ -0,0 +1,33 @@
# Llama Benchmarks

The llama folder contains code and scripts for stable benchmarking of Llama models.

To get the model weights, go to https://huggingface.co/meta-llama/Llama-2-7b, https://huggingface.co/meta-llama/Meta-Llama-3-8B, or https://huggingface.co/meta-llama/Meta-Llama-3.1-8B
and follow the steps to gain access.

Then, from the torchao root directory, run `huggingface-cli login` and follow the steps to log in. Then run `sh ./scripts/prepare.sh` to
download and convert the model weights.

Once that is done, you can execute the benchmarks from the torchao/_models/llama dir with `sh benchmarks.sh`. You can also run benchmarking or evaluation
directly using `generate.py` or `eval.py`.

## KV Cache Quantization - Memory Efficient Inference
We've added some features to `model.py`, compared to the original gpt-fast implementation, to enable long-context-length (and therefore necessarily memory-efficient) inference. Specifically, we've added kv_cache quantization and a linear_causal_mask implementation, which are **able to reduce memory usage by 50-60%** at long context lengths.

In practice, these features alongside int4 weight-only quantization allow us to run Llama3.1-8B inference with a **130k context length using only 18.9 GB of peak memory.**

You can try this out yourself with `generate.py`. These features exist as a proof of concept and technical demonstration of the techniques, though we're working on a way to release them in a more general form. Until then, feel free to copy these features into your own models. The details and a full explanation can be found in this [PR](https://github.com/pytorch/ao/pull/738).
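
For intuition, here's a minimal sketch of the general idea behind kv_cache quantization (store K/V in int8 with per-token, per-head scales and dequantize on read). This is only an illustration of the technique, not the implementation in `model.py`; see the PR above for the real details.

```python
import torch

def quantize_kv_per_token(x: torch.Tensor):
    # x: (batch, n_kv_heads, seq_len, head_dim) activations in bf16/fp16.
    # One scale per token per head, computed over head_dim.
    scale = x.abs().amax(dim=-1, keepdim=True).float().clamp(min=1e-6) / 127.0
    q = torch.clamp(torch.round(x.float() / scale), -128, 127).to(torch.int8)
    return q, scale

def dequantize_kv(q: torch.Tensor, scale: torch.Tensor, dtype=torch.bfloat16):
    return (q.float() * scale).to(dtype)

# Hypothetical K-cache slice: batch 1, 8 KV heads, 4096 tokens, head_dim 128.
k = torch.randn(1, 8, 4096, 128, dtype=torch.bfloat16)
q_k, k_scale = quantize_kv_per_token(k)

bf16_mib = k.numel() * k.element_size() / 2**20
int8_mib = (q_k.numel() * q_k.element_size() + k_scale.numel() * k_scale.element_size()) / 2**20
print(f"bf16 cache: {bf16_mib:.1f} MiB, int8 cache (+scales): {int8_mib:.1f} MiB")
print("max round-trip error:", (dequantize_kv(q_k, k_scale) - k).abs().max().item())
```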

To see how these techniques scale, we've run `generate.py` with subsets of these features at different context lengths on an A100 GPU. You can find the commands to reproduce these numbers in `benchmarks.sh`.

| context length (tokens) | normal peak (GB) | kv_quant peak (GB) | kv quant+linear_causal_mask peak (GB) |
|-------------------------|------------------|--------------------|---------------------------------------|
| 8192 | 17.86 | 17.52 | 17.47 |
| 16384 | 19.81 | 18.75 | 18.48 |
| 32768 | 23.83 | 21.72 | 20.64 |
| 65536 | 33.5 | 29.54 | 25.24 |
| 131072 | 59.27 | 52.62 | 34.18 |
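
For a rough sense of why the savings grow with context length, note that the bf16 KV cache alone scales linearly with the number of tokens. The sketch below assumes Llama-3.1-8B's published shape (32 layers, 8 KV heads via GQA, head dim 128) and ignores weights, activations, and the causal mask, so it won't match the peak numbers above exactly.

```python
# Back-of-the-envelope KV-cache size, assuming a Llama-3.1-8B-like config.
layers, kv_heads, head_dim = 32, 8, 128  # assumed model shape
for ctx in (8192, 16384, 32768, 65536, 131072):
    elems = 2 * layers * kv_heads * head_dim * ctx  # K and V tensors
    print(f"{ctx:>6} tokens: bf16 ~{elems * 2 / 2**30:.1f} GiB, "
          f"int8 ~{elems * 1 / 2**30:.1f} GiB (ignoring scales)")
```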

## Adding Benchmarks For New Techniques

If you want to add benchmarks that you think should be kept up to date, please keep the format consistent. For performance-focused techniques (e.g. ones that require fine-tuning or other preparation), add an option to run them in `generate.py` and an execution command in `benchmarks.sh` in the relevant section: if the technique is still in development, add it to the `OTHER BENCHMARKS` section; if it has a finalized API and you want its numbers in the main quantization README, add it to the `README BENCHMARKS` section. For accuracy-focused techniques, add them to `eval.py` and `evaluations.sh` in a similar vein. Ideally, techniques in the main README will have both benchmarks and evaluations set up here so they can be monitored and reproduced easily.
362 changes: 362 additions & 0 deletions benchmarks/models/llama/_eval.py
@@ -0,0 +1,362 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


import lm_eval
import torch
import torch.nn.functional as F

from torchao.quantization.GPTQ_MT import MultiTensor
from torchao.quantization.utils import _MultiInput

try: # lm_eval version 0.4
from lm_eval.evaluator import evaluate # pyre-ignore[21]
from lm_eval.models.huggingface import HFLM as eval_wrapper # pyre-ignore[21]
from lm_eval.tasks import get_task_dict # pyre-ignore[21]
except ImportError:  # lm_eval version 0.3
from lm_eval import base, evaluator, tasks

eval_wrapper = base.BaseLM
get_task_dict = tasks.get_task_dict
evaluate = evaluator.evaluate


class MultiTensorInputRecorder(eval_wrapper):
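    """
    Fake lm_eval wrapper that records the inputs seen during evaluation (as
    MultiTensor objects, see torchao.quantization.GPTQ_MT) instead of running a
    real model, so they can later be used for GPTQ-style calibration. Works like
    InputRecorder below, but accumulates inputs and their indices into two
    parallel lists.
    """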
def __init__(
self,
tokenizer,
calibration_seq_length,
input_prep_func=None,
pad_calibration_inputs=False,
vocab_size=32000,
pad_token=0,
device="cpu",
):
try:
super().__init__()
except TypeError:
# lm_eval 0.4.2 removed the default init
super().__init__("gpt2", device="cpu")

self.tokenizer = tokenizer
self._device = torch.device(device)
self.vocab_size = vocab_size
self._max_seq_length = calibration_seq_length
self.calibration_seq_length = calibration_seq_length

self.input_prep_func = (
input_prep_func if input_prep_func is not None else lambda x: (x,)
)

self.pad_calibration_inputs = pad_calibration_inputs
self.pad_token = pad_token

# Initialize inputs as a list of two empty lists for input tensors and indices
self.inputs = [[], []]

@property
def eot_token_id(self):
try:
return self.tokenizer.eos_id()
        except TypeError:
return self.tokenizer.eos_id

@property
def max_length(self):
return self._max_seq_length

@property
def max_gen_toks(self):
return 50

@property
def batch_size(self):
return 1

@property
def device(self):
return self._device

def tok_encode(self, string: str, **kwargs):
tokens = self.tokenizer.encode(string)
if hasattr(self.tokenizer, "bos_id"):
try:
tokens = [self.tokenizer.bos_id()] + tokens
            except TypeError:
tokens = [self.tokenizer.bos_id] + tokens
return tokens

def tok_decode(self, tokens):
decoded = self.tokenizer.decode(tokens)
return decoded

def add_input(self, args):
# Ensure that inputs are added correctly as pairs
self.inputs[0].append(args[0])
self.inputs[1].append(args[1])

def record_inputs(self, calibration_tasks, calibration_limit):
try:
lm_eval.tasks.initialize_tasks()
        except Exception:
pass

task_dict = get_task_dict(calibration_tasks)
print("Obtaining GPTQ calibration inputs on: ", calibration_tasks)

evaluate(
self,
task_dict,
limit=calibration_limit,
)
return self

def get_inputs(self):
# Return MultiTensor instances for both inputs and indices
return [MultiTensor(self.inputs[0]), MultiTensor(self.inputs[1])]

def _model_call(self, inps):
inps = inps.squeeze(0)
T = len(inps)
if (
# Can't use inputs that are too short when padding is disabled
(T < self.calibration_seq_length and not self.pad_calibration_inputs)
or
# Can't use inputs that actually use the token we use for padding
(self.pad_calibration_inputs and self.pad_token in inps)
):
# Give random output
return torch.randn(
(1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
)

# Pad or truncate to the correct size
if T >= self.calibration_seq_length:
inps = inps[: self.calibration_seq_length]
else:
inps = F.pad(
inps, (0, self.calibration_seq_length - T), value=self.pad_token
)

inps = inps.unsqueeze(0)
model_in = self.input_prep_func(inps)

self.add_input(model_in)

# Output `something` with the correct shape to keep eval going
return torch.randn(
(1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
)

def _model_generate(self, context, max_length, eos_token_id):
raise Exception("unimplemented")


class InputRecorder(eval_wrapper):
"""
    This is a fake evaluation wrapper for the lm_eval library that just records the
    inputs so that they can be used in calibration.

    If pad_calibration_inputs is enabled, the input recorder will pad/truncate
    each input to the calibration_seq_length (if using padding you should set
    the embeddings for the pad_token to 0 in the model). If it is disabled,
    inputs will only be truncated to the desired length.

    Note: after padding/truncation, input_prep_func is called to bring the
    input into the proper form to be inserted into a given model.
"""

def __init__(
self,
tokenizer,
calibration_seq_length,
input_prep_func=None,
pad_calibration_inputs=False,
vocab_size=32000,
pad_token=0,
device="cpu",
):
try:
super().__init__()
except TypeError:
# lm_eval 0.4.2 removed the default init
super().__init__("gpt2", device="cpu")

self.tokenizer = tokenizer
self._device = torch.device(device)
self.vocab_size = vocab_size
self._max_seq_length = calibration_seq_length
self.calibration_seq_length = calibration_seq_length

        # need to take inps and convert to correct input
        # for the model
self.input_prep_func = (
input_prep_func if input_prep_func is not None else lambda x: (x,)
)

self.pad_calibration_inputs = pad_calibration_inputs
self.pad_token = pad_token

self.inputs = None

@property
def eot_token_id(self):
try:
return self.tokenizer.eos_id()
        except TypeError:
return self.tokenizer.eos_id

@property
def max_length(self):
return self._max_seq_length

@property
def max_gen_toks(self):
return 50

@property
def batch_size(self):
return 1

@property
def device(self):
return self._device

def tok_encode(self, string: str, **kwargs):
# TODO: verify this for multi-batch as well
tokens = self.tokenizer.encode(string)
if hasattr(self.tokenizer, "bos_id"):
try:
tokens = [self.tokenizer.bos_id()] + tokens
            except TypeError:
tokens = [self.tokenizer.bos_id] + tokens
return tokens

def tok_decode(self, tokens):
decoded = self.tokenizer.decode(tokens)
return decoded

def add_input(self, args):
if self.inputs is None:
self.inputs = [_MultiInput([arg]) for arg in args]
else:
self.inputs = [
multi.add_input(arg) for (multi, arg) in zip(self.inputs, args)
]

def record_inputs(
self,
calibration_tasks,
calibration_limit,
):
try:
lm_eval.tasks.initialize_tasks()
        except Exception:
pass

task_dict = get_task_dict(calibration_tasks)
print("Obtaining GPTQ calibration inputs on: ", calibration_tasks)

evaluate(
self,
task_dict,
limit=calibration_limit,
)
return self

def get_inputs(self):
return self.inputs

def _model_call(self, inps):
inps = inps.squeeze(0)
T = len(inps)
if (
# can't use inputs that are too short when padding disabled
(T < self.calibration_seq_length and not self.pad_calibration_inputs)
or
# can't use inputs that actually use token we use for padding
(self.pad_calibration_inputs and self.pad_token in inps)
):
# give random output
return torch.randn(
(1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
)

# pad or truncate to the right size
if T >= self.calibration_seq_length:
inps = inps[: self.calibration_seq_length]
else:
            # pad on the right with the pad token (mirrors MultiTensorInputRecorder._model_call)
            inps = F.pad(
                inps, (0, self.calibration_seq_length - T), value=self.pad_token
            )

inps = inps.unsqueeze(0)
model_in = self.input_prep_func(inps)

self.add_input(model_in)

# output `something` with correct shape to keep eval going
return torch.randn(
(1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
)

def _model_generate(self, context, max_length, eos_token_id):
raise Exception("unimplemented")


class TransformerEvalWrapper(InputRecorder):
"""
A wrapper class for GPTFast, providing integration with the lm-evaluation-harness library.
"""

def __init__(
self, model, tokenizer, max_seq_length, input_prep_func=None, device="cuda"
):
super().__init__(tokenizer, None)
self._model = model
# self.tokenizer = tokenizer
self._device = torch.device(device)
self._max_seq_length = max_seq_length

        # need to take inps and convert to correct input
        # for the model
self.input_prep_func = (
input_prep_func if input_prep_func is not None else lambda x: (x,)
)

def _model_call(self, inps):
# TODO: make batches work
input = self.input_prep_func(inps)

max_seq_length = min(max(inps.size()), self.max_length)
with torch.device(self._device):
self._model.setup_caches(self.batch_size, max_seq_length)
logits = self._model(*input)
return logits

def _model_generate(self, context, max_length, eos_token_id):
raise Exception("unimplemented")

def run_eval(self, tasks, limit):
try:
lm_eval.tasks.initialize_tasks()
        except Exception:
pass

task_dict = get_task_dict(tasks)
print("Evaluating Model On: ", task_dict)
with torch.no_grad():
result = evaluate(
self,
task_dict,
limit=limit,
)
for task, res in result["results"].items():
print(f"{task}: {res}")
return result
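

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the benchmark scripts): how the
# wrappers above might be driven programmatically. `model` and `tokenizer`
# below are hypothetical placeholders for a gpt-fast style Transformer and
# its tokenizer, loaded elsewhere (e.g. as generate.py / eval.py do).
#
#   # Record GPTQ calibration inputs from a small calibration task:
#   recorder = InputRecorder(tokenizer, calibration_seq_length=2048)
#   recorder.record_inputs(calibration_tasks=["wikitext"], calibration_limit=4)
#   calibration_inputs = recorder.get_inputs()
#
#   # Evaluate a (possibly quantized) model on lm_eval tasks:
#   wrapper = TransformerEvalWrapper(model, tokenizer, max_seq_length=2048)
#   results = wrapper.run_eval(tasks=["wikitext"], limit=None)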