From 381ef76d6e2f0f8421f3a5e6ee769a14cb6249cc Mon Sep 17 00:00:00 2001
From: Apurva Jain
Date: Thu, 3 Apr 2025 23:06:51 -0700
Subject: [PATCH] Move model benchmarking

---
 benchmarks/models/llama/README.md | 33 +
 benchmarks/models/llama/_eval.py | 362 +++++
 benchmarks/models/llama/benchmark_results.txt | 52 +
 benchmarks/models/llama/benchmarks.sh | 118 ++
 benchmarks/models/llama/bsr_bench_results.txt | 27 +
 benchmarks/models/llama/bsr_benchmarks.sh | 17 +
 benchmarks/models/llama/demo_summarize.sh | 13 +
 benchmarks/models/llama/eval.py | 355 +++++
 benchmarks/models/llama/evals.sh | 28 +
 benchmarks/models/llama/generate.py | 1255 +++++++++++++++++
 benchmarks/models/sam/benchmark.sh | 17 +
 benchmarks/models/sam/eval_combo.py | 692 +++++++++
 benchmarks/models/sam/flash_4_configs.p | Bin 0 -> 219 bytes
 benchmarks/models/sam/metrics.py | 76 +
 benchmarks/models/sam/results.csv | 7 +
 benchmarks/models/utils.py | 111 ++
 16 files changed, 3163 insertions(+)
 create mode 100644 benchmarks/models/llama/README.md
 create mode 100644 benchmarks/models/llama/_eval.py
 create mode 100644 benchmarks/models/llama/benchmark_results.txt
 create mode 100644 benchmarks/models/llama/benchmarks.sh
 create mode 100644 benchmarks/models/llama/bsr_bench_results.txt
 create mode 100644 benchmarks/models/llama/bsr_benchmarks.sh
 create mode 100644 benchmarks/models/llama/demo_summarize.sh
 create mode 100644 benchmarks/models/llama/eval.py
 create mode 100644 benchmarks/models/llama/evals.sh
 create mode 100644 benchmarks/models/llama/generate.py
 create mode 100755 benchmarks/models/sam/benchmark.sh
 create mode 100644 benchmarks/models/sam/eval_combo.py
 create mode 100644 benchmarks/models/sam/flash_4_configs.p
 create mode 100644 benchmarks/models/sam/metrics.py
 create mode 100644 benchmarks/models/sam/results.csv
 create mode 100644 benchmarks/models/utils.py

diff --git a/benchmarks/models/llama/README.md b/benchmarks/models/llama/README.md
new file mode 100644
index 0000000000..99f1919fc9
--- /dev/null
+++ b/benchmarks/models/llama/README.md
@@ -0,0 +1,33 @@
+# Llama Benchmarks
+
+The llama folder contains code and scripts for stable benchmarking of Llama models.
+
+To get model weights, go to https://huggingface.co/meta-llama/Llama-2-7b, https://huggingface.co/meta-llama/Meta-Llama-3-8B, https://huggingface.co/meta-llama/Meta-Llama-3.1-8B
+and follow the steps to gain access.
+
+Then, from the torchao root directory, use `huggingface-cli login` and follow the steps to log in, then run `sh ./scripts/prepare.sh` to
+download and convert the model weights.
+
+Once done, you can execute benchmarks from the benchmarks/models/llama dir with `sh benchmarks.sh`. You can run benchmarking or evaluation
+directly using `generate.py` or `eval.py`.
+
+## KV Cache Quantization - Memory Efficient Inference
+We've added some features to `model.py` compared to the original gpt-fast implementation in order to enable long context length (and necessarily memory-efficient) inference. Specifically, we've added kv_cache quantization and a linear_causal_mask implementation, which are **able to reduce memory usage by 50-60%** at long context lengths.
+
+In practice these features, alongside int4 weight-only quantization, allow us to do Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.**
+
+You can check it out yourself with `generate.py`; a sample invocation is sketched below. These features exist as a proof of concept and technical demonstration of the techniques, though we're working to figure out a way to release them in a more general way.
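+
+For reference, a long-context run that combines these features with int4 weight-only quantization might look roughly like the sketch below. This is assembled from flags that appear in `benchmarks.sh`; the exact flag combination behind the 18.9 GB number above isn't spelled out there, so treat it as illustrative rather than an exact reproduction recipe.
+
+```sh
+# Illustrative sketch: Llama-3.1-8B with a 131072-token cache, int4 weight-only
+# quantization, kv_cache quantization, and the linear causal mask.
+export CHECKPOINT_PATH=../../../checkpoints
+export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth \
+    --quantization int4wo-64 --cache_size 131072 \
+    --kv_cache_quantization --linear_causal_mask \
+    --write_result benchmark_results.txt
+```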
+Until then, feel free to copy these features into your own models. The details and a full explanation can be found in this [PR](https://github.com/pytorch/ao/pull/738).
+
+To see how these techniques scale more generally, we've run `generate.py` with subsets of these features for different context lengths on an A100 GPU. You can find commands to reproduce these numbers in `benchmarks.sh`.
+
+| context length (tokens) | normal peak (GB) | kv_quant peak (GB) | kv quant+linear_causal_mask peak (GB) |
+|-------------------------|------------------|--------------------|---------------------------------------|
+| 8192 | 17.86 | 17.52 | 17.47 |
+| 16384 | 19.81 | 18.75 | 18.48 |
+| 32768 | 23.83 | 21.72 | 20.64 |
+| 65536 | 33.5 | 29.54 | 25.24 |
+| 131072 | 59.27 | 52.62 | 34.18 |
+
+## Adding Benchmarks For New Techniques
+
+If you want to add benchmarks that you think should be kept up to date, please try to keep the format consistent. For performance-focused techniques (e.g. if they require fine-tuning or something else), add an option to run them in generate.py and an execution command in benchmarks.sh in the relevant section. If it's a technique that's still in development, add it in the `OTHER BENCHMARKS` section; if there's a finalized API and you want those numbers in the main quantization README, add them in the `README BENCHMARKS` section. For accuracy-focused techniques, add them to eval.py and evals.sh in a similar vein. Ideally, techniques in the main README will have both benchmarks and evaluations set up here so they can be monitored and reproduced easily.
diff --git a/benchmarks/models/llama/_eval.py b/benchmarks/models/llama/_eval.py
new file mode 100644
index 0000000000..9f429278e3
--- /dev/null
+++ b/benchmarks/models/llama/_eval.py
@@ -0,0 +1,362 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
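+
+# Rough usage sketch (an assumption pieced together from the class signatures
+# below, not a verbatim copy of eval.py): `InputRecorder` is driven through
+# lm_eval's `evaluate` loop to capture GPTQ calibration inputs, while
+# `TransformerEvalWrapper` runs the actual evaluation on a GPTFast-style model.
+# Task names and sequence lengths here are illustrative only.
+#
+#     recorder = InputRecorder(tokenizer, calibration_seq_length=2048)
+#     recorder.record_inputs(["wikitext"], calibration_limit=10)
+#     calibration_inputs = recorder.get_inputs()
+#
+#     TransformerEvalWrapper(model, tokenizer, max_seq_length=2048).run_eval(
+#         tasks=["wikitext"], limit=None
+#     )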
+ +import lm_eval +import torch +import torch.nn.functional as F + +from torchao.quantization.GPTQ_MT import MultiTensor +from torchao.quantization.utils import _MultiInput + +try: # lm_eval version 0.4 + from lm_eval.evaluator import evaluate # pyre-ignore[21] + from lm_eval.models.huggingface import HFLM as eval_wrapper # pyre-ignore[21] + from lm_eval.tasks import get_task_dict # pyre-ignore[21] +except: # lm_eval version 0.3 + from lm_eval import base, evaluator, tasks + + eval_wrapper = base.BaseLM + get_task_dict = tasks.get_task_dict + evaluate = evaluator.evaluate + + +class MultiTensorInputRecorder(eval_wrapper): + def __init__( + self, + tokenizer, + calibration_seq_length, + input_prep_func=None, + pad_calibration_inputs=False, + vocab_size=32000, + pad_token=0, + device="cpu", + ): + try: + super().__init__() + except TypeError: + # lm_eval 0.4.2 removed the default init + super().__init__("gpt2", device="cpu") + + self.tokenizer = tokenizer + self._device = torch.device(device) + self.vocab_size = vocab_size + self._max_seq_length = calibration_seq_length + self.calibration_seq_length = calibration_seq_length + + self.input_prep_func = ( + input_prep_func if input_prep_func is not None else lambda x: (x,) + ) + + self.pad_calibration_inputs = pad_calibration_inputs + self.pad_token = pad_token + + # Initialize inputs as a list of two empty lists for input tensors and indices + self.inputs = [[], []] + + @property + def eot_token_id(self): + try: + return self.tokenizer.eos_id() + except: + return self.tokenizer.eos_id + + @property + def max_length(self): + return self._max_seq_length + + @property + def max_gen_toks(self): + return 50 + + @property + def batch_size(self): + return 1 + + @property + def device(self): + return self._device + + def tok_encode(self, string: str, **kwargs): + tokens = self.tokenizer.encode(string) + if hasattr(self.tokenizer, "bos_id"): + try: + tokens = [self.tokenizer.bos_id()] + tokens + except: + tokens = [self.tokenizer.bos_id] + tokens + return tokens + + def tok_decode(self, tokens): + decoded = self.tokenizer.decode(tokens) + return decoded + + def add_input(self, args): + # Ensure that inputs are added correctly as pairs + self.inputs[0].append(args[0]) + self.inputs[1].append(args[1]) + + def record_inputs(self, calibration_tasks, calibration_limit): + try: + lm_eval.tasks.initialize_tasks() + except: + pass + + task_dict = get_task_dict(calibration_tasks) + print("Obtaining GPTQ calibration inputs on: ", calibration_tasks) + + evaluate( + self, + task_dict, + limit=calibration_limit, + ) + return self + + def get_inputs(self): + # Return MultiTensor instances for both inputs and indices + return [MultiTensor(self.inputs[0]), MultiTensor(self.inputs[1])] + + def _model_call(self, inps): + inps = inps.squeeze(0) + T = len(inps) + if ( + # Can't use inputs that are too short when padding is disabled + (T < self.calibration_seq_length and not self.pad_calibration_inputs) + or + # Can't use inputs that actually use the token we use for padding + (self.pad_calibration_inputs and self.pad_token in inps) + ): + # Give random output + return torch.randn( + (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device + ) + + # Pad or truncate to the correct size + if T >= self.calibration_seq_length: + inps = inps[: self.calibration_seq_length] + else: + inps = F.pad( + inps, (0, self.calibration_seq_length - T), value=self.pad_token + ) + + inps = inps.unsqueeze(0) + model_in = self.input_prep_func(inps) + + self.add_input(model_in) + + # 
Output `something` with the correct shape to keep eval going + return torch.randn( + (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device + ) + + def _model_generate(self, context, max_length, eos_token_id): + raise Exception("unimplemented") + + +class InputRecorder(eval_wrapper): + """ + This is a fake evaluation wrapper from the lm_eval library that just records the inputs + so that they can be used in calibration. + + If pad_calibration_inputs is enabled, the input recorder will take + each input and pad/truncate it down to the calibration_seq_length. + (if using padding you should set the embeddings for the pad_token to 0 + in the model) + + Note: after padding/truncation, input_prep_function is called to bring + it to the proper form to be inserted into a given model. + + If not, it will only truncate inputs to the desired length. + """ + + def __init__( + self, + tokenizer, + calibration_seq_length, + input_prep_func=None, + pad_calibration_inputs=False, + vocab_size=32000, + pad_token=0, + device="cpu", + ): + try: + super().__init__() + except TypeError: + # lm_eval 0.4.2 removed the default init + super().__init__("gpt2", device="cpu") + + self.tokenizer = tokenizer + self._device = torch.device(device) + self.vocab_size = vocab_size + self._max_seq_length = calibration_seq_length + self.calibration_seq_length = calibration_seq_length + + # need to take inps and convert to corrent input + # for model + self.input_prep_func = ( + input_prep_func if input_prep_func is not None else lambda x: (x,) + ) + + self.pad_calibration_inputs = pad_calibration_inputs + self.pad_token = pad_token + + self.inputs = None + + @property + def eot_token_id(self): + try: + return self.tokenizer.eos_id() + except: + return self.tokenizer.eos_id + + @property + def max_length(self): + return self._max_seq_length + + @property + def max_gen_toks(self): + return 50 + + @property + def batch_size(self): + return 1 + + @property + def device(self): + return self._device + + def tok_encode(self, string: str, **kwargs): + # TODO: verify this for multi-batch as well + tokens = self.tokenizer.encode(string) + if hasattr(self.tokenizer, "bos_id"): + try: + tokens = [self.tokenizer.bos_id()] + tokens + except: + tokens = [self.tokenizer.bos_id] + tokens + return tokens + + def tok_decode(self, tokens): + decoded = self.tokenizer.decode(tokens) + return decoded + + def add_input(self, args): + if self.inputs is None: + self.inputs = [_MultiInput([arg]) for arg in args] + else: + self.inputs = [ + multi.add_input(arg) for (multi, arg) in zip(self.inputs, args) + ] + + def record_inputs( + self, + calibration_tasks, + calibration_limit, + ): + try: + lm_eval.tasks.initialize_tasks() + except: + pass + + task_dict = get_task_dict(calibration_tasks) + print("Obtaining GPTQ calibration inputs on: ", calibration_tasks) + + evaluate( + self, + task_dict, + limit=calibration_limit, + ) + return self + + def get_inputs(self): + return self.inputs + + def _model_call(self, inps): + inps = inps.squeeze(0) + T = len(inps) + if ( + # can't use inputs that are too short when padding disabled + (T < self.calibration_seq_length and not self.pad_calibration_inputs) + or + # can't use inputs that actually use token we use for padding + (self.pad_calibration_inputs and self.pad_token in inps) + ): + # give random output + return torch.randn( + (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device + ) + + # pad or truncate to the right size + if T >= self.calibration_seq_length: + inps = inps[: 
self.calibration_seq_length]
+        else:
+            # pad on the right up to calibration_seq_length, filling with the
+            # pad token (same convention as MultiTensorInputRecorder above)
+            inps = F.pad(
+                inps, (0, self.calibration_seq_length - T), value=self.pad_token
+            )
+
+        inps = inps.unsqueeze(0)
+        model_in = self.input_prep_func(inps)
+
+        self.add_input(model_in)
+
+        # output `something` with correct shape to keep eval going
+        return torch.randn(
+            (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
+        )
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        raise Exception("unimplemented")
+
+
+class TransformerEvalWrapper(InputRecorder):
+    """
+    A wrapper class for GPTFast, providing integration with the lm-evaluation-harness library.
+    """
+
+    def __init__(
+        self, model, tokenizer, max_seq_length, input_prep_func=None, device="cuda"
+    ):
+        super().__init__(tokenizer, None)
+        self._model = model
+        # self.tokenizer = tokenizer
+        self._device = torch.device(device)
+        self._max_seq_length = max_seq_length
+
+        # need to take inps and convert to correct input
+        # for model
+        self.input_prep_func = (
+            input_prep_func if input_prep_func is not None else lambda x: (x,)
+        )
+
+    def _model_call(self, inps):
+        # TODO: make batches work
+        input = self.input_prep_func(inps)
+
+        # set up model caches for the (capped) sequence length before the forward pass
+        max_seq_length = min(max(inps.size()), self.max_length)
+        with torch.device(self._device):
+            self._model.setup_caches(self.batch_size, max_seq_length)
+        logits = self._model(*input)
+        return logits
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        raise Exception("unimplemented")
+
+    def run_eval(self, tasks, limit):
+        try:
+            lm_eval.tasks.initialize_tasks()
+        except:
+            pass
+
+        task_dict = get_task_dict(tasks)
+        print("Evaluating Model On: ", task_dict)
+        with torch.no_grad():
+            result = evaluate(
+                self,
+                task_dict,
+                limit=limit,
+            )
+        for task, res in result["results"].items():
+            print(f"{task}: {res}")
+        return result
diff --git a/benchmarks/models/llama/benchmark_results.txt b/benchmarks/models/llama/benchmark_results.txt
new file mode 100644
index 0000000000..d59c5f552e
--- /dev/null
+++ b/benchmarks/models/llama/benchmark_results.txt
@@ -0,0 +1,52 @@
+README BENCHMARKS
+llama 2
+20240831225155, tok/s=107.38, mem/s=1418.93 GB/s, peak_mem=13.88 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
+20240831225810, tok/s= 9.61, mem/s= 63.67 GB/s, peak_mem= 8.61 GB, model_size= 6.62 GB quant: int8dq, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8dq --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
+20240831230013, tok/s=170.83, mem/s=1131.18 GB/s, peak_mem= 8.95 GB, model_size= 6.62 GB quant: int8wo, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8wo --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8
+20240910152454, tok/s=117.89, mem/s= 584.57 GB/s, peak_mem= 6.52 GB, model_size= 4.96 GB quant: fp6,
mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization fp6 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831230205, tok/s=201.14, mem/s= 751.42 GB/s, peak_mem= 4.87 GB, model_size= 3.74 GB quant: int4wo-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831230736, tok/s=177.45, mem/s=1194.35 GB/s, peak_mem= 8.64 GB, model_size= 6.73 GB quant: autoquant, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization autoquant --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240902100527, tok/s=209.19, mem/s= 804.32 GB/s, peak_mem= 4.89 GB, model_size= 3.84 GB quant: autoquant-int4, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization autoquant-int4 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 + +llama 3 +20240831232535, tok/s= 95.64, mem/s=1435.54 GB/s, peak_mem=16.43 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831233224, tok/s= 8.61, mem/s= 64.75 GB/s, peak_mem= 9.24 GB, model_size= 7.52 GB quant: int8dq, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8dq --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831233853, tok/s=153.03, mem/s=1150.80 GB/s, peak_mem=10.42 GB, model_size= 7.52 GB quant: int8wo, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int8wo --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910153353, tok/s=161.58, mem/s= 910.02 GB/s, peak_mem= 7.72 GB, model_size= 5.63 GB quant: fp6, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization fp6 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision 
torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831234218, tok/s=180.80, mem/s= 763.33 GB/s, peak_mem= 6.88 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831235355, tok/s=158.10, mem/s=1193.24 GB/s, peak_mem=10.04 GB, model_size= 7.55 GB quant: autoquant, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization autoquant --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240902101015, tok/s=188.41, mem/s= 800.58 GB/s, peak_mem= 7.14 GB, model_size= 4.25 GB quant: autoquant-int4, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization autoquant-int4 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 + +KV CACHE QUANTIZATION: +20240826161508, tok/s= 19.71, mem/s= 295.80 GB/s, peak_mem=17.86 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 8192 +20240826161747, tok/s= 13.52, mem/s= 202.96 GB/s, peak_mem=17.52 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 8192--kv_cache_quantization +20240826162028, tok/s= 13.30, mem/s= 199.66 GB/s, peak_mem=17.47 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 8192--kv_cache_quantization --linear_causal_mask +20240826162318, tok/s= 12.54, mem/s= 188.22 GB/s, peak_mem=19.81 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 16384 +20240826162620, tok/s= 10.67, mem/s= 160.12 GB/s, peak_mem=18.75 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, 
kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 16384--kv_cache_quantization +20240826162920, tok/s= 10.57, mem/s= 158.67 GB/s, peak_mem=18.48 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 16384--kv_cache_quantization --linear_causal_mask +20240826163307, tok/s= 7.11, mem/s= 106.75 GB/s, peak_mem=23.83 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 32768 +20240826163710, tok/s= 6.33, mem/s= 94.98 GB/s, peak_mem=21.72 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 32768--kv_cache_quantization +20240826164117, tok/s= 6.20, mem/s= 93.02 GB/s, peak_mem=20.64 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 32768--kv_cache_quantization --linear_causal_mask +20240826164715, tok/s= 3.72, mem/s= 55.84 GB/s, peak_mem=33.50 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 65536 +20240826165343, tok/s= 3.32, mem/s= 49.90 GB/s, peak_mem=29.54 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 65536--kv_cache_quantization +20240826170011, tok/s= 3.31, mem/s= 49.71 GB/s, peak_mem=25.24 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 
--max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 65536--kv_cache_quantization --linear_causal_mask +20240826171015, tok/s= 1.95, mem/s= 29.21 GB/s, peak_mem=59.27 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 131072 +20240826172121, tok/s= 1.73, mem/s= 26.02 GB/s, peak_mem=52.62 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 131072--kv_cache_quantization +20240826173230, tok/s= 1.73, mem/s= 25.95 GB/s, peak_mem=34.18 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3.1-8B, kv_quant: True, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 --cache_size 131072--kv_cache_quantization --linear_causal_mask + +OTHER BENCHMARKS +20240831224311, tok/s= 26.75, mem/s= 707.01 GB/s, peak_mem=27.23 GB, model_size=26.43 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float32, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float32 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831224512, tok/s= 22.97, mem/s= 303.53 GB/s, peak_mem=13.64 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831224958, tok/s=108.48, mem/s=1433.57 GB/s, peak_mem=13.90 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910004030, tok/s= 22.72, mem/s= 112.66 GB/s, peak_mem=10.41 GB, model_size= 4.96 GB quant: fp6, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization fp6 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910004539, tok/s= 50.99, mem/s= 200.08 GB/s, peak_mem= 6.29 GB, model_size= 3.92 GB quant: uintx-4-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, 
device: cuda repro: python generate.py --quantization uintx-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910005147, tok/s= 40.25, mem/s= 265.95 GB/s, peak_mem= 9.24 GB, model_size= 6.61 GB quant: uintx-2-8, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-2-8 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910110554, tok/s=245.07, mem/s= 657.93 GB/s, peak_mem= 4.05 GB, model_size= 2.68 GB quant: sparse-marlin, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 + +20240831231514, tok/s= 26.54, mem/s= 796.59 GB/s, peak_mem=32.34 GB, model_size=30.02 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float32, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float32 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831231725, tok/s= 23.67, mem/s= 355.33 GB/s, peak_mem=16.19 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240831232327, tok/s= 96.59, mem/s=1449.85 GB/s, peak_mem=16.43 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910005537, tok/s= 20.22, mem/s= 113.89 GB/s, peak_mem=23.17 GB, model_size= 5.63 GB quant: fp6, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization fp6 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910010056, tok/s= 47.85, mem/s= 213.24 GB/s, peak_mem=11.85 GB, model_size= 4.46 GB quant: uintx-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910010647, tok/s= 34.83, mem/s= 261.42 GB/s, peak_mem=14.99 GB, model_size= 7.51 GB quant: 
uintx-2-8, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization uintx-2-8 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 +20240910110958, tok/s=223.95, mem/s= 682.88 GB/s, peak_mem= 5.59 GB, model_size= 3.05 GB quant: sparse-marlin, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8 diff --git a/benchmarks/models/llama/benchmarks.sh b/benchmarks/models/llama/benchmarks.sh new file mode 100644 index 0000000000..4c11b193d5 --- /dev/null +++ b/benchmarks/models/llama/benchmarks.sh @@ -0,0 +1,118 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder + +# README BENCHMARKS +export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt + +export MODEL_REPO=meta-llama/Meta-Llama-3-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-int4 --write_result benchmark_results.txt + +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo 
--write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt +# Runs on H100, float8 is not supported on CUDA arch < 8.9 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8wo --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-tensor --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-wo --write_result benchmark_results.txt + +# OTHER BENCHMARKS + +# kv cache quantization +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization --linear_causal_mask +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 16384 --kv_cache_quantization --linear_causal_mask +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 32768 --kv_cache_quantization --linear_causal_mask +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 65536 --kv_cache_quantization --linear_causal_mask +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 131072 --kv_cache_quantization --linear_causal_mask + +export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill 
--write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt + +export MODEL_REPO=meta-llama/Meta-Llama-3-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt + +# Different Batch Size Benchmarks +export MODEL_REPO=meta-llama/Meta-Llama-3-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 1 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 128 + +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 1 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 128 + +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 1 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path 
$CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoquant --write_result benchmark_results.txt --batch_size 128 + +# TTFT benchmarks +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8dq --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8wo --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8dq --sparsity semi-structured --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization float8dq --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization float8wo --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int4wo-64 --write_result benchmark_results.txt --prefill_size 8000 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization sparse-marlin --write_result benchmark_results.txt --prefill_size 8000 --precision float16 --sparsity semi-structured + +# gemlite benchmarks +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-64 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-None --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-None --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-8-None --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-8-None --write_result benchmark_results.txt + +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-64 --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-64 --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-None --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-None --write_result 
benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-8-None --write_result benchmark_results.txt --batch_size 32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-8-None --write_result benchmark_results.txt --batch_size 32 + +# 2:4 sparse model +export MODEL_REPO=nm-testing/SparseLlama-3-8B-pruned_50.2of4 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --sparsity semi-structured --precision float16 --write_result benchmark_results.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result benchmark_results.txt diff --git a/benchmarks/models/llama/bsr_bench_results.txt b/benchmarks/models/llama/bsr_bench_results.txt new file mode 100644 index 0000000000..09c4d6cbcd --- /dev/null +++ b/benchmarks/models/llama/bsr_bench_results.txt @@ -0,0 +1,27 @@ + +20250226151422, tok/s=133.29, tok/s_decode=134.40, ttft=0.0118, mem/s=2000.68 GB/s, peak_mem=16.30 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226151926, tok/s=242.08, tok/s_decode=256.68, ttft=0.0464, mem/s=1182.14 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226152416, tok/s=252.18, tok/s_decode=267.48, ttft=0.0448, mem/s=1229.49 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226153215, tok/s=204.19, tok/s_decode=213.86, ttft=0.0438, mem/s=1226.65 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226153628, tok/s=180.14, tok/s_decode=187.54, ttft=0.0433, mem/s=1081.56 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB 
quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226160622, tok/s=246.20, tok/s_decode=255.21, ttft=0.0281, mem/s= 956.89 GB/s, peak_mem= 5.56 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226160651, tok/s=145.07, tok/s_decode=163.13, ttft=0.1522, mem/s=1461.87 GB/s, peak_mem=22.76 GB, model_size=10.08 GB quant: None, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 + +20250226161533, tok/s=250.71, tok/s_decode=254.78, ttft=0.0121, mem/s= 974.38 GB/s, peak_mem= 5.56 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226161913, tok/s=251.19, tok/s_decode=254.95, ttft=0.0112, mem/s= 976.26 GB/s, peak_mem= 5.63 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226181326, tok/s=134.44, tok/s_decode=140.82, ttft=0.0669, mem/s= 807.62 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226181520, tok/s=138.03, tok/s_decode=164.08, ttft=0.2295, mem/s=1390.97 GB/s, peak_mem=22.74 GB, model_size=10.08 GB quant: None, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: 
python generate.py --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226181738, tok/s=192.65, tok/s_decode=205.62, ttft=0.0649, mem/s=1157.32 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226182045, tok/s=192.75, tok/s_decode=206.24, ttft=0.0673, mem/s=1157.27 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226182350, tok/s=236.36, tok/s_decode=257.62, ttft=0.0693, mem/s=1154.19 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226182712, tok/s=231.24, tok/s_decode=250.55, ttft=0.0661, mem/s=1127.37 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226183255, tok/s=169.58, tok/s_decode=179.82, ttft=0.0665, mem/s=1018.74 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226183527, tok/s=184.74, tok/s_decode=196.38, ttft=0.0637, mem/s=1109.18 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226183734, tok/s=232.60, tok/s_decode=252.51, ttft=0.0673, mem/s=1135.85 GB/s, peak_mem= 6.74 GB, 
model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250226183953, tok/s=232.47, tok/s_decode=251.15, ttft=0.0635, mem/s=1133.40 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227084325, tok/s=200.72, tok/s_decode=210.91, ttft=0.0475, mem/s=1205.82 GB/s, peak_mem= 8.00 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227084708, tok/s=211.76, tok/s_decode=222.43, ttft=0.0447, mem/s=1271.42 GB/s, peak_mem= 7.99 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227085051, tok/s=241.09, tok/s_decode=255.19, ttft=0.0452, mem/s=1177.31 GB/s, peak_mem= 6.47 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227085446, tok/s=247.53, tok/s_decode=262.94, ttft=0.0468, mem/s=1206.80 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227090411, tok/s=250.11, tok/s_decode=263.99, ttft=0.0416, mem/s=1219.39 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth 
--device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 +20250227091144, tok/s=249.14, tok/s_decode=263.74, ttft=0.0439, mem/s=1214.68 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8 diff --git a/benchmarks/models/llama/bsr_benchmarks.sh b/benchmarks/models/llama/bsr_benchmarks.sh new file mode 100644 index 0000000000..1ce4da8759 --- /dev/null +++ b/benchmarks/models/llama/bsr_benchmarks.sh @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +# BSR benchmarks +export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B + +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result bsr_bench_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result bsr_bench_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --sparsity semi-structured --precision float16 --write_result bsr_bench_results.txt +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.8-32 +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.8-64 +# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.9-32 +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.9-64 diff --git a/benchmarks/models/llama/demo_summarize.sh b/benchmarks/models/llama/demo_summarize.sh new file mode 100644 index 0000000000..6a9a11e254 --- /dev/null +++ b/benchmarks/models/llama/demo_summarize.sh @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
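+# NOTE: this script assumes CHECKPOINT_PATH is already set in the environment
+# (the other benchmark scripts export it explicitly); if it is not, set it first, e.g.:
+# export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder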
+# grab moby dick prompt +wget -nc -O moby.txt https://gist.githubusercontent.com/jcaip/f319146bb543e92e23b2c76815b0f29f/raw/31a9cd12b0b59f323eb197c9534953bdac352986/gistfile1.txt + +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B-Instruct + +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8dq_prefill_wo_decode --prefill_size 8192 --max_new_tokens 256 --num_samples 1 --demo_summarize_prompt moby.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8wo --prefill_size 8192 --max_new_tokens 256 --num_samples 1 --demo_summarize_prompt moby.txt +python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization int8dq --prefill_size 8192 --max_new_tokens 256 --num_samples 1 --demo_summarize_prompt moby.txt diff --git a/benchmarks/models/llama/eval.py b/benchmarks/models/llama/eval.py new file mode 100644 index 0000000000..4a67124a08 --- /dev/null +++ b/benchmarks/models/llama/eval.py @@ -0,0 +1,355 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +import time +from pathlib import Path +from typing import List, Optional + +import torch +from generate import ( + _load_model, + device_sync, +) +from tokenizer import get_tokenizer + +import torchao +from torchao._models.llama.model import prepare_inputs_for_model +from torchao.quantization import ( + PerRow, + PerTensor, + float8_dynamic_activation_float8_weight, + float8_weight_only, + fpx_weight_only, + int4_weight_only, + int8_dynamic_activation_int8_weight, + int8_weight_only, + quantize_, + uintx_weight_only, +) +from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, unwrap_tensor_subclass + + +def run_evaluation( + checkpoint_path: Path, + tasks: List[str], + limit: Optional[int] = None, + device="cuda", + precision=torch.bfloat16, + quantization: Optional[str] = None, + sparsity: Optional[str] = None, + compile=False, + max_length=None, + calibration_tasks: Optional[List[str]] = None, + calibration_limit: Optional[int] = None, + calibration_seq_length: Optional[int] = None, + pad_calibration_inputs: Optional[bool] = False, +): + """Runs the evaluation of a model using LM Eval.""" + print( + f"\nEvaluating model {checkpoint_path} on tasks: {tasks}, limit: {limit}, device: {device}, precision: {precision}, " + + f"quantization: {quantization}, sparsity: {sparsity}, compile: {compile}, max_length: {max_length}, calibration_tasks: {calibration_tasks}, " + + f"calibration_seq_length: {calibration_seq_length}, pad_calibration_inputs: {pad_calibration_inputs}\n" + ) + torchao.quantization.utils.recommended_inductor_config_setter() + + assert checkpoint_path.is_file(), checkpoint_path + tokenizer_path = checkpoint_path.parent / "tokenizer.model" + assert tokenizer_path.is_file(), str(tokenizer_path) + # Load Model and Tokenizer + print("Loading model ...") + t0 = time.time() + model = _load_model(checkpoint_path, "cpu", precision) + + if max_length is None: + max_length = model.config.block_size + device_sync(device=device) # MKG + print(f"Time to load model: {time.time() - t0:.02f} seconds") + tokenizer = get_tokenizer(tokenizer_path, checkpoint_path) + + if quantization: + if "spinquant" in quantization: + from torchao.prototype.spinquant import apply_spinquant + + apply_spinquant(model) + if 
"int8wo" in quantization: + quantize_(model, int8_weight_only()) + if "int8dq" in quantization: + quantize_(model, int8_dynamic_activation_int8_weight()) + if "fp6" in quantization: + quantize_(model, fpx_weight_only(3, 2)) + if "int4wo" in quantization and not "gptq" in quantization: + if "hqq" in quantization: + use_hqq = True + else: + use_hqq = False + groupsize = int(quantization.split("-")[1]) + assert ( + groupsize in [32, 64, 128, 256] + ), f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}" + quantize_( + model.to(device), + int4_weight_only(group_size=groupsize, use_hqq=use_hqq), + ) + if "uintx" in quantization: + # uintx-nbits-groupsize + # "uintx-2-64" + if "hqq" in quantization: + use_hqq = True + else: + use_hqq = False + _quant_args = quantization.split("-") + nbits = int(_quant_args[1]) + _NBITS_TO_DTYPE = { + 1: torch.uint1, + 2: torch.uint2, + 3: torch.uint3, + 4: torch.uint4, + 5: torch.uint5, + 6: torch.uint6, + 7: torch.uint7, + 8: torch.uint8, + } + dtype = _NBITS_TO_DTYPE[nbits] + group_size = int(_quant_args[2]) + quantize_(model, uintx_weight_only(dtype, group_size, use_hqq=use_hqq)) + if "marlin" in quantization: + from torchao.dtypes import MarlinSparseLayout + + quantize_(model, int4_weight_only(layout=MarlinSparseLayout())) + if "int4wo" in quantization and "gptq" in quantization: + # avoid circular imports + from torchao._models._eval import MultiTensorInputRecorder + from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer + + groupsize = int(quantization.split("-")[-2]) + assert ( + groupsize in [32, 64, 128, 256] + ), f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}" + assert ( + precision == torch.bfloat16 + ), f"{quantization} requires precision or bfloat16 but got {precision}" + assert "cuda" in device, "int4 gptq quantization only works on cuda" + inputs = ( + MultiTensorInputRecorder( + tokenizer, + calibration_seq_length, + prepare_inputs_for_model, + pad_calibration_inputs, + model.config.vocab_size, + device="cpu", + ) + .record_inputs( + calibration_tasks, + calibration_limit, + ) + .get_inputs() + ) + + quantizer = Int4WeightOnlyGPTQQuantizer(group_size=groupsize, device=device) + model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length) + model = quantizer.quantize(model, inputs).to(device) + else: + if not TORCH_VERSION_AT_LEAST_2_5: + unwrap_tensor_subclass(model) + if "float8wo" in quantization: + quantize_(model, float8_weight_only()) + if "float8dq" in quantization: + granularity = str(quantization.split("-")[-1]) + if granularity == "tensor": + granularity = PerTensor() + elif granularity == "row": + granularity = PerRow() + else: + if granularity == "float8dq": + granularity = PerTensor() + else: + raise ValueError(f"Unknown granularity {granularity}") + quantize_( + model, float8_dynamic_activation_float8_weight(granularity=granularity) + ) + if "autoround" in quantization: + from transformers import AutoTokenizer + + from torchao._models.llama.model import TransformerBlock + from torchao.prototype.autoround.autoround_llm import ( + quantize_model_with_autoround_, + ) + + _tokenizer = AutoTokenizer.from_pretrained(checkpoint_path.parent) + # parse args from quantization string: + # autoround--------- + _quant_args = quantization.split("-") + _default_quant_args = [False, 200, 128, 8, 2048, 128, 1, 0] + _model_devie = _quant_args[1] if len(_quant_args) > 1 else device + _quant_args = _quant_args[2:] + ( + quant_lm_head, + iters, + groupsize, + batch_size, + 
seqlen, + nsamples, + grad_acc_steps, + compile_optimization_process, + ) = [int(x) for x in _quant_args] + _default_quant_args[len(_quant_args) :] + model = model.to(_model_devie) + print( + ( + f"Quantizing model with autoround(iters={iters}, groupsize={groupsize}, " + f"quant_lm_head={quant_lm_head}, batch_size={batch_size}, seqlen={seqlen}, nsamples={nsamples}, " + f"gradient_accumulate_steps={grad_acc_steps}, " + f"compile_optimization_process={compile_optimization_process})" + ) + ) + with torch.device(_model_devie): + model.setup_caches( + max_batch_size=batch_size, max_seq_length=seqlen, training=True + ) + + if quant_lm_head: + is_target_module = ( + lambda mod, fqn: isinstance(mod, TransformerBlock) + or "output" in fqn + ) + else: + is_target_module = lambda mod, fqn: isinstance(mod, TransformerBlock) + quantize_model_with_autoround_( + model=model, + tokenizer=_tokenizer, + is_target_module=is_target_module, + bits=4, + seqlen=seqlen, + batch_size=batch_size, + iters=iters, + nsamples=nsamples, + gradient_accumulate_steps=grad_acc_steps, + compile_optimization_process=compile_optimization_process == 1, + ) + model.to(device) + model.reset_caches() + if "codebook" in quantization: + from torchao.prototype.quantization.codebook import codebook_weight_only + + model.to(device) + quantize_( + model, codebook_weight_only(dtype=torch.uint4, scale_block_size=64) + ) + + if compile: + model = torch.compile(model, mode="max-autotune", fullgraph=True) + with torch.no_grad(): + print("Running evaluation ...") + # avoid circular imports + from torchao._models._eval import TransformerEvalWrapper + + TransformerEvalWrapper( + model=model.to(device), + tokenizer=tokenizer, + max_seq_length=max_length, + input_prep_func=prepare_inputs_for_model, + device=device, + ).run_eval( + tasks=tasks, + limit=limit, + ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Run HF Model Evaluation") + parser.add_argument( + "--checkpoint_path", + type=Path, + default=Path("../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"), + help="Model checkpoint path.", + ) + parser.add_argument( + "--tasks", + nargs="+", + type=str, + default=["wikitext"], + help="List of lm-eluther tasks to evaluate usage: --tasks task1 task2", + ) + parser.add_argument( + "--limit", type=int, default=None, help="Number of eval samples to evaluate" + ) + parser.add_argument( + "--precision", + type=lambda x: getattr(torch, x.split(".")[-1]), + default=torch.bfloat16, + help="dtype precision to use", + ) + parser.add_argument( + "--device", type=str, default="cuda", help="Device to use for evaluation" + ) + parser.add_argument( + "-q", + "--quantization", + type=str, + help=( + "Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-, " + "int4wo--gptq, autoquant, autoquant-int4, int4wo--hqq, " + "uintx--, uintx---hqq, sparse-marlin, spinquant, " + "autoround---------, " + "float8wo, float8dq, float8saq" + ), + ) + parser.add_argument( + "--sparsity", + type=str, + help=("Which sparsity techniques to apply: semi-structured"), + ) + parser.add_argument( + "--compile", action="store_true", help="Whether to compile the model." 
+ ) + parser.add_argument( + "--max_length", + type=int, + default=None, + help="Length of text to process at one time", + ) + parser.add_argument( + "--calibration_tasks", + type=str, + nargs="+", + default=["wikitext"], + help="tasks to do gptq calibration on, if doing gptq", + ) + parser.add_argument( + "--calibration_limit", + type=int, + default=1000, + help="number of samples to use for gptq calibration", + ) + parser.add_argument( + "--calibration_seq_length", + type=int, + default=100, + help="length of sequences to use for gptq calibration", + ) + parser.add_argument( + "--pad_calibration_inputs", + type=bool, + default=False, + help="pads sequences shorter than calibration_seq_length to that length, yielding more calibration inputs but running much slower", + ) + + args = parser.parse_args() + run_evaluation( + args.checkpoint_path, + args.tasks, + args.limit, + args.device, + args.precision, + args.quantization, + args.sparsity, + args.compile, + args.max_length, + args.calibration_tasks, + args.calibration_limit, + args.calibration_seq_length, + args.pad_calibration_inputs, + ) diff --git a/benchmarks/models/llama/evals.sh b/benchmarks/models/llama/evals.sh new file mode 100644 index 0000000000..b4c32644a7 --- /dev/null +++ b/benchmarks/models/llama/evals.sh @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder + +export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround # auto-round w/o quant_lm_head +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head + +export MODEL_REPO=meta-llama/Meta-Llama-3-8B +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu # auto-round w/o quant_lm_head +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head + +export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu # auto-round w/o quant_lm_head +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64 +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8wo +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-tensor +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-row + +# Testing on additional tasks +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo --tasks 'winogrande' 'arc_challenge' +python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo --tasks 'mmlu' 'truthfulqa_mc2' diff --git a/benchmarks/models/llama/generate.py b/benchmarks/models/llama/generate.py new file mode 100644 index 0000000000..6be6a31e0a --- 
/dev/null +++ b/benchmarks/models/llama/generate.py @@ -0,0 +1,1255 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import Optional, Tuple + +import torch +import torch._dynamo.config +import torch._inductor.config + +import torchao +from torchao._models.utils import ( + get_arch_name, + write_json_result_local, + write_json_result_ossci, +) +from torchao.quantization.quant_primitives import MappingType +from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, get_model_size_in_bytes + +torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False +torch.backends.cuda.enable_cudnn_sdp(True) + + +class HostEvent: + def __init__(self): + self.event_time = None + + def record(self): + self.event_time = time.perf_counter() + + def elapsed_time(self, other_event): + if self.event_time is None: + raise ValueError("Event not recorded!") + # return ms to match cuda event + return abs(other_event.event_time - self.event_time) * 1000 + + +def device_timer(device): + if "cuda" in device: + return torch.cuda.Event(enable_timing=True) + elif ("cpu" in device) or ("mps" in device): + return HostEvent() + else: + print(f"device={device} is not yet supported") + + +def device_sync(device): + if "cuda" in device: + torch.cuda.synchronize(device) + elif "xpu" in device: + torch.xpu.synchronize(device) + elif ("cpu" in device) or ("mps" in device): + pass + else: + print(f"device={device} is not yet supported") + + +default_device = ( + "cuda" + if torch.cuda.is_available() + else "xpu" + if torch.xpu.is_available() + else "cpu" +) + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from torchao._models.llama.model import Transformer, prepare_inputs_for_model +from torchao._models.llama.tokenizer import get_tokenizer + + +def multinomial_sample_one_no_sync( + probs_sort, +): # Does multinomial sampling without a cuda synchronization + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + +def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None): + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None): + probs = logits_to_probs(logits[:, -1], temperature, top_k) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + +def prefill( + model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs +) -> torch.Tensor: + # input_pos: [B, S] + logits = model(x, input_pos) + return sample(logits, **sampling_kwargs)[0] + + +def decode_one_token( + model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs +) -> Tuple[torch.Tensor, torch.Tensor]: + # input_pos: [B, 1] + assert input_pos.shape[-1] == 1 + logits = model(x, input_pos) + return sample(logits, **sampling_kwargs) + + +def decode_n_tokens( + model: Transformer, + cur_token: torch.Tensor, + input_pos: torch.Tensor, + num_new_tokens: int, + callback=lambda _:
_, + **sampling_kwargs, +): + new_tokens, new_probs = [], [] + for i in range(num_new_tokens): + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH): + next_token, next_prob = decode_one_token( + model, cur_token, input_pos, **sampling_kwargs + ) + next_token, next_prob = next_token.clone(), next_prob.clone() + input_pos += 1 + # in some instances not having this causes weird issues with the stored tokens when you run the next decode_one_token step + new_tokens.append(next_token.clone()) + callback(new_tokens[-1]) + new_probs.append(next_prob) + cur_token = next_token + + return new_tokens, new_probs + + +def model_forward(model, x, input_pos): + return model(x, input_pos) + + +@torch.no_grad() +def generate( + model: Transformer, + prompt: torch.Tensor, + max_new_tokens: int, + batch_size: int, + *, + interactive: bool, + callback=lambda x: x, + kv_cache_quantization: bool = False, + cache_size: Optional[int] = None, + linear_causal_mask: bool = False, + prefill_start_event: Optional[torch.cuda.Event] = None, + prefill_end_event: Optional[torch.cuda.Event] = None, + decode_start_event: Optional[torch.cuda.Event] = None, + decode_end_event: Optional[torch.cuda.Event] = None, + **sampling_kwargs, +) -> torch.Tensor: + """ + Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested. + """ + + # create an empty tensor of the expected final shape and fill in the current tokens + device = prompt.device + T = prompt.size(-1) + + # calculate how many tokens to generate based on max_new_tokens and model's upper bound (block_size) + max_seq_length = ( + min(T + max_new_tokens, model.config.block_size) if not interactive else 350 + ) + new_tokens = max_seq_length - T + + # format model input + prompt, input_pos = prepare_inputs_for_model(prompt) + prompt = prompt.repeat(batch_size, 1) # expand prompt based on batchsize + + # full prompt+output will be stored in seq + seq = torch.empty(batch_size, max_seq_length, dtype=prompt.dtype, device=device) + seq[:, :T] = prompt + + # setup model caches + with torch.device(device): + if cache_size is None: + cache_size = max_seq_length + assert ( + cache_size >= max_seq_length + ), "need cache_size to be greater than max_new_tokens + size-of-prompt" + model.setup_caches( + max_batch_size=batch_size, + max_seq_length=cache_size, + kv_cache_quantization=kv_cache_quantization, + linear_causal_mask=linear_causal_mask, + prompt_length=T, + ) + + # execute prefill + if prefill_start_event is not None: + prefill_start_event.record() + next_token = prefill( + model, prompt.view(batch_size, -1), input_pos, **sampling_kwargs + ).clone() + seq[:, T] = next_token.squeeze() + if prefill_end_event is not None: + prefill_end_event.record() + + # execute token generation + if decode_start_event is not None: + decode_start_event.record() + input_pos = torch.tensor([T], device=device, dtype=torch.int) + generated_tokens, _ = decode_n_tokens( + model, + next_token.view(batch_size, -1), + input_pos, + new_tokens - 1, + callback=callback, + **sampling_kwargs, + ) + seq = torch.cat((seq[:, : T + 1], *generated_tokens), dim=-1) + if decode_end_event is not None: + decode_end_event.record() + + return seq + + +def encode_tokens(tokenizer, string, bos=True, device=default_device): + tokens = tokenizer.encode(string) + if bos: + tokens = [tokenizer.bos_id()] + tokens + return torch.tensor(tokens, dtype=torch.int, device=device) + + +def _load_model(checkpoint_path, device, precision): + checkpoint = 
torch.load(str(checkpoint_path), mmap=True, weights_only=True) + if "model" in checkpoint and "stories" in str(checkpoint_path): + checkpoint = checkpoint["model"] + with torch.device("meta"): + model = Transformer.from_name(checkpoint_path.parent.name) + model.load_state_dict(checkpoint, assign=True) + model = model.to(device=device, dtype=precision) + + return model.eval() + + +B_INST, E_INST = "[INST]", "[/INST]" + + +def main( + prefill_size: Optional[int] = None, + prompt: str = "Hello, my name is", + demo_summarize_prompt: Optional[str] = None, + interactive: bool = False, + num_samples: int = 5, + max_new_tokens: int = 100, + batch_size: int = 1, + top_k: int = 200, + temperature: float = 0.8, + checkpoint_path: Path = Path( + "checkpoints/meta-Transformer/Transformer-2-7b-chat-hf/model.pth" + ), + quantization: Optional[str] = None, + min_sqnr: Optional[float] = None, + sparsity: Optional[str] = None, + kv_cache_quantization: bool = False, + cache_size: Optional[int] = None, + linear_causal_mask: bool = False, + save: bool = False, + compile: bool = True, + compile_prefill: bool = False, + profile: Optional[Path] = None, + memory_profile: Optional[Path] = None, + device=default_device, + precision=torch.bfloat16, + write_result: Optional[Path] = None, + output_json_path: Optional[Path] = None, + output_json_local: bool = False, +) -> None: + """Generates text samples based on a pre-trained Transformer model and tokenizer.""" + + if prefill_size is not None and prefill_size > 0: + # create prompt of prefill size + if demo_summarize_prompt is None: + prompt = "prompt " * (int(prefill_size) - 2) + else: + with open(demo_summarize_prompt, "r") as f: + prompt = f.read() + + torchao.quantization.utils.recommended_inductor_config_setter() + + assert checkpoint_path.is_file(), checkpoint_path + tokenizer_path = checkpoint_path.parent / "tokenizer.model" + assert tokenizer_path.is_file(), str(tokenizer_path) + + print(f"Using device={device}") + is_chat = "chat" in str(checkpoint_path) + + print("Loading model ...") + t0 = time.time() + model = _load_model(checkpoint_path, device, precision) + + device_sync(device=device) # MKG + print(f"Time to load model: {time.time() - t0:.02f} seconds") + + tokenizer = get_tokenizer(tokenizer_path, checkpoint_path) + + encoded = encode_tokens(tokenizer, prompt, bos=True, device=device) + + if demo_summarize_prompt is not None: + end_tag = encode_tokens(tokenizer, "\n ", bos=False, device=device) + encoded = encoded[: prefill_size - end_tag.size(0)] + encoded = torch.cat((encoded, end_tag), dim=0) + + prompt_length = encoded.size(0) + + torch.manual_seed(1234) + + def ffn_only(mod, fqn): + return isinstance(mod, torch.nn.Linear) and "feed_forward" in fqn + + def not_ffn_only(mod, fqn): + return isinstance(mod, torch.nn.Linear) and not ffn_only(mod, fqn) + + def ffn_or_attn_only(mod, fqn): + return isinstance(mod, torch.nn.Linear) and ( + "feed_forward" in fqn or "attention" in fqn + ) + + if quantization: + from torchao.quantization import ( + Float8DynamicActivationFloat8SemiSparseWeightConfig, + autoquant, + float8_dynamic_activation_float8_weight, + float8_weight_only, + fpx_weight_only, + gemlite_uintx_weight_only, + int4_dynamic_activation_int4_weight, + int4_weight_only, + int8_dynamic_activation_int4_weight, + int8_dynamic_activation_int8_weight, + int8_weight_only, + quantize_, + uintx_weight_only, + ) + from torchao.quantization.granularity import PerRow, PerTensor + from torchao.utils import unwrap_tensor_subclass + + if "spinquant" in 
quantization: + from torchao.prototype.spinquant import apply_spinquant + + apply_spinquant(model) + if quantization.startswith("gemlite"): + import os + import pwd + + from gemlite.core import GemLiteLinearTriton + + _quant_args = quantization.split("-") + bit_width = int(_quant_args[-2]) + group_size = None if _quant_args[-1] == "None" else int(_quant_args[-1]) + try: + packing_bitwidth = int(_quant_args[-3]) + except: + # if only 2 inputs found, use default value + packing_bitwidth = 32 + + quantize_( + model, + gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth), + ) + + # try to load gemlite kernel config + try: + GemLiteLinearTriton.load_config( + f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + print( + f"loaded gemlite kernel cache /tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + except: + print( + f"unable to load gemlite kernel cache /tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + + print("running gemlite warmup") + generate( + model, + encode_tokens(tokenizer, prompt, bos=True, device=device), + max_new_tokens, + batch_size, + interactive=False, + temperature=temperature, + top_k=top_k, + ) + GemLiteLinearTriton.cache_config( + f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + if "int8wo" in quantization: + quantize_(model, int8_weight_only()) + if "int8dq" in quantization: + if sparsity and "semi" in sparsity: + from torchao.dtypes import SemiSparseLayout + + quantize_( + model, + int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()), + filter_fn=ffn_only, + ) + quantize_( + model, int8_dynamic_activation_int8_weight(), filter_fn=not_ffn_only + ) + elif "int8dq_prefill_wo_decode" in quantization: + quantize_( + model, int8_dynamic_activation_int8_weight(weight_only_decode=True) + ) + else: + quantize_(model, int8_dynamic_activation_int8_weight()) + if "int4wo" in quantization: + use_hqq = False + if "hqq" in quantization: + use_hqq = True + group_size = int(quantization.split("-")[1]) + assert ( + group_size + in [ + 32, + 64, + 128, + 256, + ] + ), f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}" + quantize_(model, int4_weight_only(group_size=group_size, use_hqq=use_hqq)) + elif "int4dq-" in quantization: + from torchao.dtypes import CutlassInt4PackedLayout + + nbits = int(quantization.removeprefix("int4dq-")) + assert nbits == 4 or nbits == 8 + if nbits == 4: + quantize_( + model, + int4_dynamic_activation_int4_weight( + mapping_type=MappingType.SYMMETRIC, + act_mapping_type=MappingType.SYMMETRIC, + layout=CutlassInt4PackedLayout(), + ), + ) + elif nbits == 8: + quantize_( + model, + int8_dynamic_activation_int4_weight( + group_size=None, + mapping_type=MappingType.SYMMETRIC, + act_mapping_type=MappingType.SYMMETRIC, + layout=CutlassInt4PackedLayout(), + ), + ) + if "marlin" in quantization: + if "qqq" in quantization: + from torchao.dtypes import MarlinQQQLayout + + quantize_( + model, + int8_dynamic_activation_int4_weight( + group_size=128, + mapping_type=MappingType.SYMMETRIC, + act_mapping_type=MappingType.SYMMETRIC, + layout=MarlinQQQLayout(), + ), + ) + elif "semi" in sparsity: + from torchao.dtypes import MarlinSparseLayout + + quantize_( + model, + int4_weight_only(layout=MarlinSparseLayout()), + filter_fn=ffn_or_attn_only, + ) + if "fp6" in quantization: + quantize_(model, fpx_weight_only(3, 2)) + elif "embed-int8wo" in quantization: + quantize_( + model, + int8_weight_only(group_size=64), + filter_fn=lambda x, *args: isinstance(x, torch.nn.Embedding), + ) + 
elif quantization.startswith("awq"): + from torchao._models._eval import TransformerEvalWrapper + from torchao.utils import TORCH_VERSION_AT_LEAST_2_3 + + if not TORCH_VERSION_AT_LEAST_2_3: + print("Awq requires torch2.3+") + exit() + from torchao.prototype.awq import ( + AWQObservedLinear, + awq_uintx, + insert_awq_observer_, + ) + + quant_dtype = quantization.split("-")[1] + group_size = int(quantization.split("-")[2]) + quant_dtype = getattr(torch, quant_dtype, torch.uint8) + model = model.to(device) + # get calibration data + insert_awq_observer_( + model, 1, 256, quant_dtype=quant_dtype, group_size=group_size + ) + TransformerEvalWrapper( + model=model.to(device), + tokenizer=tokenizer, + max_seq_length=256, + input_prep_func=prepare_inputs_for_model, + device=device, + ).run_eval( + tasks=["wikitext"], + limit=1, + ) + is_observed_linear = lambda m, fqn: isinstance(m, AWQObservedLinear) + use_hqq = "hqq" in quantization + quantize_( + model, + awq_uintx( + quant_dtype=quant_dtype, group_size=group_size, use_hqq=use_hqq + ), + is_observed_linear, + ) + elif "uintx" in quantization: + # uintx-nbits-group_size, e.g. "uintx-2-64" + if "hqq" in quantization: + # uintx-nbits-group_size-hqq + use_hqq = True + else: + use_hqq = False + _quant_args = quantization.split("-") + nbits = int(_quant_args[1]) + assert nbits >= 1 and nbits <= 8, "nbits must be 1 to 8" + _NBITS_TO_DTYPE = { + 1: torch.uint1, + 2: torch.uint2, + 3: torch.uint3, + 4: torch.uint4, + 5: torch.uint5, + 6: torch.uint6, + 7: torch.uint7, + 8: torch.uint8, + } + dtype = _NBITS_TO_DTYPE[nbits] + group_size = int(_quant_args[2]) + quantize_(model, uintx_weight_only(dtype, group_size, use_hqq=use_hqq)) + elif "int8_dynamic_activation_intx_weight" in quantization: + from torchao.experimental.quant_api import ( + int8_dynamic_activation_intx_weight, + ) + from torchao.quantization.granularity import PerGroup + + assert ( + precision == torch.float32 + ), "int8_dynamic_activation_intx_weight requires using precision=torch.float32" + + # Quantize model + _quant_args = quantization.split("-") + weight_dtype = getattr(torch, f"int{_quant_args[1]}") + granularity = PerGroup(int(_quant_args[2])) + has_weight_zeros = bool(_quant_args[3]) + quantize_( + model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + ), + ) + elif "float8wo" in quantization: + quantize_(model, float8_weight_only()) + elif "float8dq" in quantization: + if sparsity and "semi" in sparsity: + quantize_( + model, + Float8DynamicActivationFloat8SemiSparseWeightConfig(), + filter_fn=ffn_only, + ) + else: + granularity = str(quantization.split("-")[-1]) + if granularity == "tensor": + granularity = PerTensor() + elif granularity == "row": + granularity = PerRow() + else: + granularity = PerTensor() + quantize_( + model, + float8_dynamic_activation_float8_weight(granularity=granularity), + ) + elif "autoquant_v2" in quantization: + from torchao._models._eval import InputRecorder + from torchao._models.llama.model import prepare_inputs_for_model + from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 + + calibration_seq_length = 256 + inputs = ( + InputRecorder( + tokenizer, + calibration_seq_length, + prepare_inputs_for_model, + False, # pad_calibration_inputs + model.config.vocab_size, + device="cuda", + ) + .record_inputs( + ["wikitext"], + 1, + ) + .get_inputs()[0] + .values[0] + ) + inputs = prepare_inputs_for_model(inputs) + with torch.device("cuda"): + 
model.setup_caches( + max_batch_size=1, max_seq_length=calibration_seq_length + ) + + if "autoquant_v2-int4" == quantization: + model = autoquant_v2( + model, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, + example_input=inputs, + batch_size=calibration_seq_length, + ) + elif "autoquant_v2-float8" == quantization: + model = autoquant_v2( + model, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.OTHER_AUTOQUANT_CLASS_LIST, + example_input=inputs, + batch_size=calibration_seq_length, + ) + elif "autoquant_v2-fp" == quantization: + model = autoquant_v2( + model, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, + example_input=inputs, + batch_size=calibration_seq_length, + ) + elif "autoquant_v2-all" == quantization: + all_qtensor_classes = ( + torchao.prototype.quantization.autoquant_v2.DEFAULT_AUTOQUANT_CLASS_LIST + + torchao.prototype.quantization.autoquant_v2.DEFAULT_INT4_AUTOQUANT_CLASS_LIST + + torchao.prototype.quantization.autoquant_v2.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST + ) + if torchao.utils.is_sm_89(): + # this is fp8 related subclasses, should rename + all_qtensor_classes += torchao.prototype.quantization.autoquant_v2.OTHER_AUTOQUANT_CLASS_LIST + model = autoquant_v2( + model, + manual=True, + qtensor_class_list=all_qtensor_classes, + example_input=inputs, + batch_size=calibration_seq_length, + ) + else: + model = autoquant_v2( + model, + manual=True, + example_input=inputs, + batch_size=calibration_seq_length, + ) + + print("running generate") + generate( + model, + encode_tokens(tokenizer, prompt, bos=True, device=device), + max_new_tokens, + batch_size, + interactive=False, + temperature=temperature, + top_k=top_k, + ) + + print("running finalize autoquant") + # do autoquantization + model.finalize_autoquant() + elif "autoquant" in quantization: + from torchao._models._eval import InputRecorder + from torchao._models.llama.model import prepare_inputs_for_model + + calibration_seq_length = 256 + inputs = ( + InputRecorder( + tokenizer, + calibration_seq_length, + prepare_inputs_for_model, + False, # pad_calibration_inputs + model.config.vocab_size, + device="cuda", + ) + .record_inputs( + ["wikitext"], + 1, + ) + .get_inputs()[0] + .values[0] + ) + inputs = prepare_inputs_for_model(inputs) + with torch.device("cuda"): + model.setup_caches( + max_batch_size=1, max_seq_length=calibration_seq_length + ) + + if "autoquant-int4" == quantization: + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-float8" == quantization: + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.OTHER_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-fp" == quantization: + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-sparse" == quantization: + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_SPARSE_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-gemlite-int4" == quantization: + import os + import pwd + + from gemlite.core import GemLiteLinearTriton + + GemLiteLinearTriton.load_config( + 
f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.GEMLITE_INT4_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + elif "autoquant-all" == quantization: + try: + import os + import pwd + + from gemlite.core import GemLiteLinearTriton + + GemLiteLinearTriton.load_config( + f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json" + ) + except: + pass + + model = autoquant( + model, + manual=True, + qtensor_class_list=torchao.quantization.ALL_AUTOQUANT_CLASS_LIST, + example_input=inputs, + min_sqnr=min_sqnr, + ) + else: + model = autoquant( + model, manual=True, example_input=inputs, min_sqnr=min_sqnr + ) + + generate( + model, + encode_tokens(tokenizer, prompt, bos=True, device=device), + max_new_tokens, + batch_size, + interactive=False, + temperature=temperature, + top_k=top_k, + ) + + # do autoquantization + model.finalize_autoquant() + elif "codebook" in quantization: + from torchao.prototype.quantization.codebook import codebook_weight_only + + model.to(device) + quantize_( + model, codebook_weight_only(dtype=torch.uint4, scale_block_size=64) + ) + + else: + if not TORCH_VERSION_AT_LEAST_2_5: + unwrap_tensor_subclass(model) + + # standalone sparsity + elif sparsity: + from torchao.sparsity import semi_sparse_weight, sparsify_ + + if "semi" in sparsity: + # Fixed sparsity level for 2:4 + sparsify_(model.to(device), semi_sparse_weight(), filter_fn=ffn_only) + + if "bsr" in sparsity: + from torchao.sparsity import SupermaskLinear, block_sparse_weight + + # parse "bsr-0.9-64" + _, sparsity_level, blocksize = sparsity.split("-") + sparsity_level, blocksize = float(sparsity_level), int(blocksize) + sparsify_( + model, + lambda x: SupermaskLinear.from_linear( + x, + sparsity_level=sparsity_level, + blocksize=blocksize, + ), + filter_fn=ffn_only, + ) + print(model) + sparsify_( + model, + SupermaskLinear.to_linear, + filter_fn=ffn_only, + ) + print(model) + + # Accelerate with triton bsr kernels + sparsify_( + model, block_sparse_weight(blocksize=blocksize), filter_fn=ffn_only + ) + + model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9 + + if save: + output_dir = str(checkpoint_path.cwd()) + filename = str(checkpoint_path.name).split(".")[0] + torch.save( + model.state_dict(), + os.path.join(output_dir, filename + f"-{quantization}.pt"), + ) + + if compile: + print("Compiling Model") + global decode_one_token, prefill + decode_one_token = torch.compile( + decode_one_token, + mode="reduce-overhead", + fullgraph=True, + ) + + if compile_prefill: + prefill = torch.compile(prefill, fullgraph=True, dynamic=True) + + if memory_profile: + if device == "cuda": + torch.cuda.memory._record_memory_history( + True, trace_alloc_max_entries=250000, trace_alloc_record_context=True + ) + elif device == "xpu": + torch.xpu.memory._record_memory_history( + True, trace_alloc_max_entries=250000, trace_alloc_record_context=True + ) + else: + print("Memory profiling only works on CUDA or XPU devices") + + aggregate_metrics = { + "tokens_per_sec": [], + "time": [], + "decode_tokens_per_sec": [], + "prefill_time": [], + } + start = -1 if compile else 0 + + for i in range(start, num_samples): + if i == 0: + if device == "cuda": + torch.cuda.reset_peak_memory_stats() # MKG + elif device == "xpu": + torch.xpu.reset_peak_memory_stats() # MKG + device_sync(device=device) # MKG + if i >= 0 and interactive: + prompt = input("What is your prompt? 
") + if is_chat: + prompt = f"{B_INST} {prompt.strip()} {E_INST}" + encoded = encode_tokens(tokenizer, prompt, bos=True, device=device) + + if interactive and i >= 0 and prefill_size is None: + buffer = [] + period_id = tokenizer.encode(".")[0] + done_generating = False + + def callback(x): + nonlocal done_generating + if done_generating: + return + buffer.append(tokenizer.decode([period_id] + x.squeeze(0).tolist())[1:]) + if x.item() == tokenizer.eos_id(): + done_generating = True + if len(buffer) == 4 or done_generating: + print("".join(buffer), end="", flush=True) + buffer.clear() + # print(, end="", flush=True) + + elif demo_summarize_prompt is not None and i >= 0: + buffer = [] + period_id = tokenizer.encode(".")[0] + + def callback(x): + buffer.append(tokenizer.decode([period_id] + x.squeeze(0).tolist())[1:]) + if len(buffer) == 4: + print("".join(buffer), end="", flush=True) + buffer.clear() + else: + callback = lambda x: x + t0 = time.perf_counter() + prefill_start_event, prefill_end_event = ( + device_timer(device), + device_timer(device), + ) + decode_start_event, decode_end_event = ( + device_timer(device), + device_timer(device), + ) + import contextlib + + if i != num_samples - 1 or not profile: + prof = contextlib.nullcontext() + else: + torch.profiler._utils._init_for_cuda_graphs() + prof = torch.profiler.profile() + with prof: + y = generate( + model, + encoded, + max_new_tokens, + batch_size, + interactive=interactive, + callback=callback, + temperature=temperature, + top_k=top_k, + kv_cache_quantization=kv_cache_quantization, + cache_size=cache_size, + linear_causal_mask=linear_causal_mask, + prefill_start_event=prefill_start_event, + prefill_end_event=prefill_end_event, + decode_start_event=decode_start_event, + decode_end_event=decode_end_event, + ) + if i < 0: + print(f"Compilation time: {time.perf_counter() - t0:.2f} seconds") + continue + if hasattr(prof, "export_chrome_trace"): + prof.export_chrome_trace(f"{profile}.json") + device_sync(device=device) # MKG + t = time.perf_counter() - t0 + + if not interactive and demo_summarize_prompt is None and prefill_size is None: + tok_list = y[0].tolist() + # truncate text after end of string token + tokens = ( + tok_list + if tokenizer.eos_id() not in tok_list + else tok_list[: tok_list.index(tokenizer.eos_id())] + ) + print(tokenizer.decode(tokens)) + else: + print("\n") + tokens_generated = y.size(-1) - prompt_length + tokens_sec = tokens_generated / t + aggregate_metrics["tokens_per_sec"].append(tokens_sec) + aggregate_metrics["time"].append(t) + decode_time = decode_start_event.elapsed_time(decode_end_event) / 1000 + decode_tokens_sec = tokens_generated / decode_time + aggregate_metrics["decode_tokens_per_sec"].append(decode_tokens_sec) + prefill_time = prefill_start_event.elapsed_time(prefill_end_event) / 1000 + aggregate_metrics["prefill_time"].append(prefill_time) + print( + f"Sample {i+1} | overall time {t:.04f} s {tokens_sec:.02f} tokens/sec", + f"| prefill time {prefill_time:.04f} s decode {decode_tokens_sec:.02f} tokens/sec", + ) + print(f"Bandwidth achieved: {model_size * tokens_sec:.02f} GB/s") + + if memory_profile and i == 0: + if device == "cuda": + snapshot = torch.cuda.memory._snapshot() + elif device == "xpu": + snapshot = torch.xpu.memory._snapshot() + else: + print("Memory profiling only works on CUDA or XPU devices") + + with open(f"{memory_profile}.pickle", "wb") as f: + from pickle import dump + + dump(snapshot, f) + print( + f"\nmemory profile {memory_profile}.pickle saved, to convert that to a 
usable file, use", + "python pytorch/torch/cuda/_memory_viz.py trace_plot -o .html", + ) + break + print("==========") + + # ignore first sample for warmup + tokpersec = torch.mean(torch.tensor(aggregate_metrics["tokens_per_sec"])).item() + ttft = torch.mean(torch.tensor(aggregate_metrics["prefill_time"])).item() + decode_tokpersec = torch.mean( + torch.tensor(aggregate_metrics["decode_tokens_per_sec"]) + ).item() + bandwidth = model_size * tokpersec + mem = torch.cuda.max_memory_reserved() / 1e9 + print(f"Average overall tokens/sec: {tokpersec:.2f}") + print(f"Average decode tokens/sec: {decode_tokpersec:.04f} s") + print(f"Average TTFT: {ttft:.04f} s") + if device == "cuda": + mem = torch.cuda.max_memory_reserved() / 1e9 + elif device == "xpu": + mem = torch.xpu.max_memory_reserved() / 1e9 + print(f"Average tokens/sec: {tokpersec:.2f}") + if batch_size > 1: + print(f"Average tokens/sec including batches {batch_size*tokpersec:.2f}") + print(f"Average Bandwidth: {bandwidth:.02f} GB/s") + print(f"Peak Memory Usage: {mem:.02f} GB") + print(f"Model Size: {model_size:.02f} GB") + if write_result: + result_txt = f"\n{datetime.today().strftime('%Y%m%d%H%M%S')}, tok/s={tokpersec:6.2f}, tok/s_decode={decode_tokpersec:6.2f}, ttft={ttft:5.4f}, mem/s={bandwidth:7.2f} GB/s, peak_mem={mem:5.2f} GB, model_size={model_size:5.2f} GB " + result_txt += f"quant: {quantization}, sparse: {sparsity}, mod: {checkpoint_path.parent.name}, kv_quant: {kv_cache_quantization}, compile: {compile}, compile_prefill: {compile_prefill}, dtype: {precision}, device: {device} " + result_txt += "repro: python generate.py " + result_txt += f"--quantization {quantization} " if quantization else "" + result_txt += f"--sparsity {sparsity} " if sparsity else "" + result_txt += f"--checkpoint_path {checkpoint_path} " + result_txt += f"--device {device} " + result_txt += f"--precision {precision} " + result_txt += "--compile " if compile else "" + result_txt += "--compile_prefill " if compile_prefill else "" + result_txt += f"--prefill_size {prefill_size}" if prefill_size else "" + result_txt += f"--profile {profile} " if profile else "" + result_txt += f"--profile {memory_profile} " if memory_profile else "" + result_txt += "--interactive " if interactive else "" + result_txt += f"--num_samples {num_samples} " + result_txt += f"--max_new_tokens {max_new_tokens} " + result_txt += f"--batch_size {batch_size} " + result_txt += f"--top_k {top_k} " + result_txt += f"--temperature {temperature} " + result_txt += f"--cache_size {cache_size}" if cache_size else "" + result_txt += "--kv_cache_quantization " if kv_cache_quantization else "" + result_txt += "--linear_causal_mask " if linear_causal_mask else "" + + f = open(write_result, "a") + f.write(result_txt) + f.close() + + if output_json_path: + headers = [ + "name", + "dtype", + "min_sqnr", + "compile", + "device", + "arch", + "metric", + "actual", + "target", + ] + name = checkpoint_path.parent.name + arch = get_arch_name() + dtype = quantization or "noquant" + memory_result = [ + name, + dtype, + min_sqnr, + compile, + device, + arch, + "mem/s", + bandwidth, + None, + ] + performance_result = [ + name, + dtype, + min_sqnr, + compile, + device, + arch, + "tok/s", + tokpersec, + None, + ] + write_json_result = ( + write_json_result_local if output_json_local else write_json_result_ossci + ) + write_json_result(output_json_path, headers, memory_result) + write_json_result(output_json_path, headers, performance_result) + + +if __name__ == "__main__": + import argparse + + parser = 
argparse.ArgumentParser(description="Generate text with a pre-trained Llama model and benchmark performance.") + parser.add_argument( + "--prefill_size", type=int, default=None, help="Prompt (prefill) size in tokens, used for TTFT-style benchmarking" + ) + parser.add_argument( + "--prompt", type=str, default="Hello, my name is", help="Input prompt." + ) + parser.add_argument( + "--demo_summarize_prompt", type=str, help="Read prompt from text file" + ) + parser.add_argument( + "--interactive", + action="store_true", + help="Whether to launch in interactive mode", + ) + parser.add_argument("--num_samples", type=int, default=5, help="Number of samples.") + parser.add_argument( + "--max_new_tokens", type=int, default=200, help="Maximum number of new tokens." + ) + parser.add_argument( + "--batch_size", type=int, default=1, help="Batch size to benchmark with" + ) + parser.add_argument("--top_k", type=int, default=200, help="Top-k for sampling.") + parser.add_argument( + "--temperature", type=float, default=0.8, help="Temperature for sampling." + ) + parser.add_argument( + "--checkpoint_path", + type=Path, + default=Path("../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"), + help="Model checkpoint path.", + ) + parser.add_argument( + "-q", + "--quantization", + type=str, + help=( + "Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-hqq, autoquant, " + + "autoquant-int4, autoquant-gemlite-int4, autoquant-float8, autoquant-sparse, autoquant-all, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin, spinquant, " + + "embed-int8wo, marlin_qqq, gemlite-<packing_bitwidth>-<nbits>-<groupsize>, float8dq, int4dq-<nbits>" + ), + ) + parser.add_argument( + "--min_sqnr", + type=float, + default=None, + help=( + "min sqnr for quantizing vs. not quantizing a layer, used in autoquant options" + ), + ) + parser.add_argument( + "-s", + "--sparsity", + type=str, + help=("Which sparsity techniques to apply: semi-structured, bsr-<sparsity_level>-<blocksize>"), + ) + parser.add_argument( + "--kv_cache_quantization", + action="store_true", + help="Whether to quantize the KV cache", + ) + parser.add_argument( + "--cache_size", + type=int, + default=None, + help="Force size of cache to be a certain number of tokens, if not set, will use max_new_tokens+prompt_size", + ) + parser.add_argument( + "--linear_causal_mask", + action="store_true", + help="Whether to use the memory efficient, but slightly less fast, linear causal mask (important for long context lengths)", + ) + parser.add_argument( + "--save", action="store_true", help="Whether to save the quantized model." + ) + parser.add_argument( + "--compile", action="store_true", help="Whether to compile the model." + ) + parser.add_argument( + "--compile_prefill", + action="store_true", + help="Whether to compile the prefill (improves prefill perf, but higher compile times)", + ) + parser.add_argument("--profile", type=Path, default=None, help="Profile path.") + parser.add_argument( + "--memory_profile", type=Path, default=None, help="filename for memory profile."
+ ) + parser.add_argument( + "--device", type=str, default=default_device, help="Device to use" + ) + parser.add_argument( + "--precision", + type=lambda x: getattr(torch, x.split(".")[-1]), + default=torch.bfloat16, + help="dtype precision to use", + ) + parser.add_argument( + "--write_result", type=Path, default=None, help="Path where to write the result" + ) + parser.add_argument( + "--output_json_path", + type=Path, + default=None, + help="Path where to write the json result for dashboard", + ) + parser.add_argument( + "--output_json_local", + action="store_true", + help="Whether to output json result for local machine or for CI machine, local option will fill in some dummy fields", + ) + + args = parser.parse_args() + print(args) + main( + args.prefill_size, + args.prompt, + args.demo_summarize_prompt, + args.interactive, + args.num_samples, + args.max_new_tokens, + args.batch_size, + args.top_k, + args.temperature, + args.checkpoint_path, + args.quantization, + args.min_sqnr, + args.sparsity, + args.kv_cache_quantization, + args.cache_size, + args.linear_causal_mask, + args.save, + args.compile, + args.compile_prefill, + args.profile, + args.memory_profile, + args.device, + args.precision, + args.write_result, + args.output_json_path, + args.output_json_local, + ) diff --git a/benchmarks/models/sam/benchmark.sh b/benchmarks/models/sam/benchmark.sh new file mode 100755 index 0000000000..e446f62cbc --- /dev/null +++ b/benchmarks/models/sam/benchmark.sh @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +# baseline +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --print_header True +# int8 dynamic quant (all) +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --compress int8_dynamic_quant +# 2:4 sparsity (all) +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --compress sparse_mlp_only +# 2:4 sparsity (mlp only) +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --compress sparse +# int8 dynamic quant + 2:4 sparsity (attn: int8, mlp lin1: int8+2:4 fuse mul, mlp lin2: 2:4 sparse) +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir 
tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half bfloat16 --device cuda --compress int8_dynamic_quant_sparse +# int8 dynamic quant attn + int4 wo + sparse marlin lin 1 + 2:4 sparse lin2 +python eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 32 --use_compile max-autotune --use_half float16 --device cuda --compress int4_weight_only_sparse diff --git a/benchmarks/models/sam/eval_combo.py b/benchmarks/models/sam/eval_combo.py new file mode 100644 index 0000000000..20f7f3ed2c --- /dev/null +++ b/benchmarks/models/sam/eval_combo.py @@ -0,0 +1,692 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +import math +import resource +import time + +import fire +import torch +import tqdm +from data import build_data, setup_coco_img_ids +from metrics import calculate_miou, create_result_entry + +import torchao +from torchao._models.utils import ( + get_arch_name, + write_json_result_local, + write_json_result_ossci, +) +from torchao.dtypes import SemiSparseLayout +from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 +from torchao.quantization import ( + autoquant, + int4_weight_only, + int8_dynamic_activation_int8_weight, + quantize_, +) +from torchao.sparsity import apply_fake_sparsity, semi_sparse_weight, sparsify_ +from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, unwrap_tensor_subclass + +torch._dynamo.config.cache_size_limit = 50000 + + +def unbind_jagged(device, data, sizes, offsets): + if data is None: + return None + data = data.to(device=device, non_blocking=True) + return [ + data[offsets[batch_idx] : offsets[batch_idx + 1]].view(sizes[batch_idx]) + for batch_idx in range(len(sizes)) + ] + + +PADDED_TENSOR = None + + +# Preallocate a "landing" Tensor for incoming data and reuse it across launches. 
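+# Padding the last, smaller batch up to batch_size keeps the image encoder's input
+# shape static, so a torch.compile'd encoder does not need to recompile for the final
+# partial batch; get_features_batch slices the padded rows back off afterwards. The
+# buffer is filled with a non_blocking copy, so the host-to-device transfer can
+# overlap with other work when the source tensor is pinned.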
+def pad_to_batch_size(batch, batch_size, device): + assert batch.dim() == 4 + # assert batch.is_pinned() + global PADDED_TENSOR + if PADDED_TENSOR is None: + batch = batch.to(device=device, non_blocking=True) + full_batch_size = (batch_size, batch.size(1), batch.size(2), batch.size(3)) + first_entry = batch[0].unsqueeze(0) + repeat_first_entry = first_entry.expand(full_batch_size) + padded_batch = torch.cat( + [batch, repeat_first_entry[batch.size(0) : batch_size]], dim=0 + ) + assert padded_batch.size() == full_batch_size + PADDED_TENSOR = padded_batch + PADDED_TENSOR[: batch.size(0)].copy_(batch, non_blocking=True) + return PADDED_TENSOR + + +def get_features_batch( + encoder, input_image_batch, pad_input_image_batch, batch_size, device +): + if pad_input_image_batch: + features_batch = encoder( + pad_to_batch_size(input_image_batch, batch_size, device) + ) + return features_batch[: input_image_batch.size(0)] + return encoder(input_image_batch) + + +def build_results_batch(predictor, batch, batch_size, pad_input_image_batch): + encoder = predictor.model.image_encoder + device = predictor.device + + input_image_batch = batch[0] + # The number of valid data points varies slightly per batch + orig_input_image_batch_size = input_image_batch.size(0) + if input_image_batch is None: + return (None, None, None) + + with torch.autograd.profiler.record_function("data transfer"): + coords_lists = unbind_jagged(*([device] + batch[1:4])) + gt_masks_lists = unbind_jagged(*([device] + batch[4:7])) + if coords_lists is None: + return (None, None, None) + datapoints = list(zip(*(batch[7:] + [coords_lists, gt_masks_lists]))) + if pad_input_image_batch: + # Pad to a static shape to avoid recompilation + input_image_batch = pad_to_batch_size(input_image_batch, batch_size, device) + else: + input_image_batch = input_image_batch.to(device=device, non_blocking=True) + + # We explicitly exclude data transfers from the timing to focus + # only on the kernel performance. + # Next we synchronize and set two events to start timing. + if torch.cuda.is_available(): + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + else: + t0 = time.time() + + with torch.autograd.profiler.record_function("timed region"): + with torch.autograd.profiler.record_function("image encoder"): + features_batch = encoder(input_image_batch) + features_batch = features_batch[:orig_input_image_batch_size] + + with torch.autograd.profiler.record_function("predict_torch"): + result_batch = [] + for batch_idx, ( + anns, + image, + input_size, + idx, + coords, + gt_masks, + ) in enumerate(datapoints): + features = features_batch.narrow(0, batch_idx, 1) + predictor.reset_image() + predictor.original_size = image.shape[:2] + predictor.input_size = input_size + predictor.features = features + predictor.is_image_set = True + coords = coords.unsqueeze(1) + fg_labels = torch.ones( + (coords.size(0), 1), dtype=torch.int, device=device + ) + masks, scores, logits = predictor.predict_torch( + point_coords=coords, + point_labels=fg_labels, + multimask_output=True, + ) + entry = create_result_entry(anns, gt_masks, masks, scores, idx) + result_batch += entry + + # After all kernels have been launched we synchronize again and measure + # the amount of time spent on the GPU. This is a fairly tight measurement + # around the launched GPU kernels and excludes data movement from host + # to device. 
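+ # Note that CUDA events report elapsed time in milliseconds, while the
+ # time.time() fallback below yields seconds; the avg_ms_per_img / img_s math
+ # downstream assumes milliseconds, so the CPU-only path reports seconds where
+ # milliseconds are expected.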
+ if torch.cuda.is_available(): + end_event.record() + torch.cuda.synchronize() + elapsed_time = start_event.elapsed_time(end_event) + else: + elapsed_time = time.time() - t0 + return result_batch, orig_input_image_batch_size, elapsed_time + + +def build_results( + batched_data_iter, + predictor, + mask_debug_out_dir, + batch_size, + use_compile, + use_compile_decoder, + pad_input_image_batch, + compress, + use_fullgraph=False, +): + # TODO: Re-enable this for datapoints + assert not use_compile_decoder + + batch_runner = build_results_batch + + results = [] + batch_idx = 0 + num_images = 0 + num_batches = 0 + elapsed_time = 0 + partial_batch = False + for batch in tqdm.tqdm(batched_data_iter): + with torch.no_grad(): + if batch_idx == 0: + with torch.autograd.profiler.record_function("compilation and warmup"): + if str(use_compile) != "False": + predictor.model.image_encoder = torch.compile( + predictor.model.image_encoder, + mode=use_compile, + fullgraph=use_fullgraph, + ) + # Run first batch a few times for warmup and exclude it from the final timings + for _ in range(5): + _ = batch_runner( + predictor, batch, batch_size, pad_input_image_batch + ) + result_batch, num_datapoints, kernel_time = batch_runner( + predictor, batch, batch_size, pad_input_image_batch + ) + if result_batch is not None: + results += result_batch + # We expect a partial batch to only happens once at the end + assert not partial_batch + # Only measure timing on full batches + if num_datapoints == batch_size: + num_images += num_datapoints + num_batches += 1 + # We consistently exclude the last (512 - filtered) images + # Since batch sizes must be powers of two and less than + # or equal 512 this ensures consistent timing across varying + # batch sizes. + if num_images <= 4488: + elapsed_time += kernel_time + else: + partial_batch = True + batch_idx += 1 + + avg_ms_per_img = None + if num_images > 0: + avg_ms_per_img = elapsed_time + avg_ms_per_img = avg_ms_per_img / num_images + + return results, avg_ms_per_img, num_batches, num_images + + +def identity_runner(fn, *args, **kwargs): + return fn(*args, **kwargs) + + +def profiler_runner(path, fn, *args, **kwargs): + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + result = fn(*args, **kwargs) + prof.export_chrome_trace(path) + return result + + +def profile_top_runner(fn, *args, **kwargs): + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + result = fn(*args, **kwargs) + if torch.cuda.is_available(): + print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)) + else: + print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) + return result + + +def memory_runner(path, fn, *args, **kwargs): + print("Start memory recording") + torch.cuda.synchronize() + torch.cuda.memory._record_memory_history( + True, trace_alloc_max_entries=100000, trace_alloc_record_context=True + ) + result = fn(*args, **kwargs) + torch.cuda.synchronize() + snapshot = torch.cuda.memory._snapshot() + print("Finish memory recording") + import pickle + + with open(path, "wb") as f: + pickle.dump(snapshot, f) + # Use to convert pickle file into html + # python torch/cuda/_memory_viz.py trace_plot .pickle -o .html + return result + + +def run( + coco_root_dir, + coco_slice_name, + sam_checkpoint_base_path, + sam_model_type, + 
point_sampling_cache_dir, + mask_debug_out_dir, + batch_size=1, + print_header=False, + coco_category_names=None, + limit=None, + img_id=None, + use_half=None, + use_compile="False", + use_compile_decoder=False, + compress=None, + min_sqnr=None, + num_workers=0, + use_rel_pos=True, + pad_input_image_batch=True, + profile_path=None, + profile_top=False, + memory_path=None, + device="cuda", + output_json_path=None, + output_json_local=False, +): + from torch._inductor import config as inductorconfig + + inductorconfig.triton.unique_kernel_names = True + inductorconfig.epilogue_fusion = True + inductorconfig.coordinate_descent_tuning = True + inductorconfig.coordinate_descent_check_all_directions = True + inductorconfig.force_fuse_int_mm_with_mul = True + inductorconfig.use_mixed_mm = True + from torch.sparse import SparseSemiStructuredTensor + + SparseSemiStructuredTensor._FORCE_CUTLASS = False + + if use_half is not None: + if use_half == "float16": + use_half = torch.float16 + elif use_half == "bfloat16": + use_half = torch.bfloat16 + else: + raise ValueError( + "Expected one of float16 or bfloat for specified {use_half}" + ) + + # Batch size needs to be a multiple of two and at most 512. + assert math.log2(batch_size).is_integer() + assert batch_size <= 512 + + # https://github.com/facebookresearch/segment-anything/tree/main#model-checkpoints + # largest to smallest: vit_h, vit_l, vit_b + model_type_to_checkpoint = { + "vit_h": f"{sam_checkpoint_base_path}/sam_vit_h_4b8939.pth", + "vit_l": f"{sam_checkpoint_base_path}/sam_vit_l_0b3195.pth", + "vit_b": f"{sam_checkpoint_base_path}/sam_vit_b_01ec64.pth", + } + + from segment_anything_fast import SamPredictor, sam_model_registry + + checkpoint_path = model_type_to_checkpoint[sam_model_type] + sam = sam_model_registry[sam_model_type](checkpoint=checkpoint_path).to( + torch.device(device) + ) + predictor = SamPredictor(sam) + + from segment_anything_fast import tools + + tools.apply_eval_dtype_predictor(predictor, use_half) + + for block in predictor.model.image_encoder.blocks: + block.attn.use_rel_pos = use_rel_pos + + # Helper filter functions + def attn_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "attn" in name + + def mlp_lin1_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "lin1" in name + + def mlp_lin2_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "lin2" in name + + def mlp_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "mlp" in name + + if compress == "int8_dynamic_quant": + quantize_(predictor.model.image_encoder, int8_dynamic_activation_int8_weight()) + if not TORCH_VERSION_AT_LEAST_2_5: + predictor.model.image_encoder = unwrap_tensor_subclass( + predictor.model.image_encoder + ) + elif compress == "sparse_mlp_only": + + def mlp_only(mod, name): + return isinstance(mod, torch.nn.Linear) and "mlp" in name + + apply_fake_sparsity(predictor.model.image_encoder, filter_fn=mlp_only) + sparsify_( + predictor.model.image_encoder, semi_sparse_weight(), filter_fn=mlp_only + ) + elif compress == "sparse": + apply_fake_sparsity(predictor.model.image_encoder) + sparsify_(predictor.model.image_encoder, semi_sparse_weight()) + elif compress == "int8_dynamic_quant_sparse": + # apply sparsify first to set qparams + apply_fake_sparsity(predictor.model.image_encoder, filter_fn=mlp_only) + + quantize_( + predictor.model.image_encoder, + int8_dynamic_activation_int8_weight(), + attn_only, + ) + quantize_( + predictor.model.image_encoder, + 
int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()), + mlp_lin1_only, + ) + sparsify_(predictor.model.image_encoder, semi_sparse_weight(), mlp_lin2_only) + if not TORCH_VERSION_AT_LEAST_2_5: + predictor.model.image_encoder = unwrap_tensor_subclass( + predictor.model.image_encoder + ) + elif compress == "int4_weight_only_sparse": + # apply sparsify first to set qparams + apply_fake_sparsity(predictor.model.image_encoder, filter_fn=mlp_only) + from torchao.dtypes import MarlinSparseLayout + + quantize_( + predictor.model.image_encoder, + int8_dynamic_activation_int8_weight(), + attn_only, + ) + quantize_( + predictor.model.image_encoder, + int4_weight_only(layout=MarlinSparseLayout()), + mlp_lin1_only, + ) + sparsify_(predictor.model.image_encoder, semi_sparse_weight(), mlp_lin2_only) + if not TORCH_VERSION_AT_LEAST_2_5: + predictor.model.image_encoder = unwrap_tensor_subclass( + predictor.model.image_encoder + ) + + elif compress is not None and "autoquant_v2" in compress: + example_input = torch.randn( + 1, 3, 1024, 1024, dtype=torch.bfloat16, device=device + ) + if "autoquant_v2-int4" == compress: + autoquant_v2( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, + ) + elif "autoquant_v2-float8" == compress: + autoquant_v2( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.OTHER_AUTOQUANT_CLASS_LIST, + ) + elif "autoquant_v2-all" == compress: + autoquant_v2( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.prototype.quantization.autoquant_v2.ALL_AUTOQUANT_CLASS_LIST, + ) + else: + autoquant_v2( + predictor.model.image_encoder, example_input=example_input, manual=True + ) + + predictor.model.image_encoder(example_input) + predictor.model.image_encoder.finalize_autoquant() + + elif compress is not None and "autoquant" in compress: + example_input = torch.randn( + 1, 3, 1024, 1024, dtype=torch.bfloat16, device=device + ) + if "autoquant-int4" == compress: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, + min_sqnr=min_sqnr, + ) + elif "autoquant-float8" == compress: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.quantization.OTHER_AUTOQUANT_CLASS_LIST, + min_sqnr=min_sqnr, + ) + elif "autoquant-sparse" == compress: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.quantization.DEFAULT_SPARSE_AUTOQUANT_CLASS_LIST, + min_sqnr=min_sqnr, + ) + elif "autoquant-all" == compress: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + qtensor_class_list=torchao.quantization.ALL_AUTOQUANT_CLASS_LIST, + min_sqnr=min_sqnr, + ) + else: + autoquant( + predictor.model.image_encoder, + example_input=example_input, + manual=True, + min_sqnr=min_sqnr, + ) + predictor.model.image_encoder(example_input) + predictor.model.image_encoder.finalize_autoquant() + else: + assert compress is None, f"Unsupported compress mode {compress}" + + coco_img_ids_, cat_id_to_cat, catIds, coco = setup_coco_img_ids( + coco_root_dir, coco_slice_name, coco_category_names, img_id + ) + + coco_img_ids = [] + for imgId in coco_img_ids_: + img = 
coco.loadImgs(imgId)[0] + annIds = coco.getAnnIds(imgIds=img["id"], catIds=catIds, iscrowd=None) + anns = coco.loadAnns(annIds) + if len(anns) != 0: + coco_img_ids.append(imgId) + + build_batch = build_data( + coco_img_ids, + coco, + catIds, + coco_root_dir, + coco_slice_name, + point_sampling_cache_dir, + predictor, + use_half, + pad_input_image_batch, + ) + + limit = len(coco_img_ids) if limit is None else limit + batched_data_iter = torch.utils.data.DataLoader( + list(range(limit)), + batch_size=batch_size, + collate_fn=build_batch, + num_workers=num_workers, + pin_memory=False, + ) + runner = identity_runner + + if profile_path is not None: + import functools + + runner = functools.partial(profiler_runner, profile_path) + + if profile_top: + runner = profile_top_runner + + if memory_path is not None: + assert ( + use_compile != "max-autotune" + ), f"Memory path does not support {use_compile}" + import functools + + runner = functools.partial(memory_runner, memory_path) + + results, avg_ms_per_img, num_batches, num_images = runner( + build_results, + batched_data_iter, + predictor, + mask_debug_out_dir, + batch_size, + use_compile, + use_compile_decoder, + pad_input_image_batch, + compress, + ) + + results = [[r[0], r[1], r[2], r[3].item()] for r in results] + + img_s, batch_ms_batch_size = None, None + if avg_ms_per_img is not None: + img_s = 1000 / avg_ms_per_img + batch_ms_batch_size = (avg_ms_per_img * num_images) / num_batches / batch_size + + mIoU = calculate_miou(results, mask_debug_out_dir, True, cat_id_to_cat) + if torch.cuda.is_available(): + max_memory_allocated_bytes = torch.cuda.max_memory_allocated() + _, total_memory = torch.cuda.mem_get_info() + max_memory_allocated_percentage = int( + 100 * (max_memory_allocated_bytes / total_memory) + ) + max_memory_allocated_bytes = max_memory_allocated_bytes >> 20 + else: + import psutil + + total_memory = psutil.virtual_memory().total + max_memory_allocated_bytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + max_memory_allocated_percentage = int( + 100 * (max_memory_allocated_bytes / (total_memory >> 10)) + ) + max_memory_allocated_bytes = max_memory_allocated_bytes >> 10 + + with open("results.csv", "a") as f: + if print_header: + header = ",".join( + [ + "device", + "sam_model_type", + "batch_size", + "memory(MiB)", + "memory(%)", + "img_s(avg)", + "batch_ms(avg)/batch_size", + "mIoU", + "use_compile", + "use_half", + "compress", + "use_compile_decoder", + "use_rel_pos", + "pad_input_image_batch", + "num_workers", + "num_batches", + "num_images", + "profile_path", + "memory_path", + ] + ) + f.write(header + "\n") + vals = ",".join( + map( + str, + [ + device, + sam_model_type, + batch_size, + max_memory_allocated_bytes, + max_memory_allocated_percentage, + img_s, + batch_ms_batch_size, + mIoU, + use_compile, + use_half, + compress, + use_compile_decoder, + use_rel_pos, + pad_input_image_batch, + num_workers, + num_batches, + num_images, + profile_path, + memory_path, + ], + ) + ) + f.write(vals + "\n") + + if output_json_path: + headers = [ + "name", + "dtype", + "min_sqnr", + "compile", + "device", + "arch", + "metric", + "actual", + "target", + ] + name = sam_model_type + arch = get_arch_name() + dtype = compress or "noquant" + # boolean flag to indicate whether compile is used + compile = use_compile != "False" + memory_result = [ + name, + dtype, + min_sqnr, + compile, + device, + arch, + "memory(MiB)", + max_memory_allocated_bytes, + None, + ] + performance_result = [ + name, + dtype, + min_sqnr, + compile, + device, 
+ arch,
+ "img_s(avg)",
+ img_s,
+ None,
+ ]
+ write_json_result = (
+ write_json_result_local if output_json_local else write_json_result_ossci
+ )
+ write_json_result(output_json_path, headers, memory_result)
+ write_json_result(output_json_path, headers, performance_result)
+
+
+if __name__ == "__main__":
+ fire.Fire(run)
diff --git a/benchmarks/models/sam/flash_4_configs.p b/benchmarks/models/sam/flash_4_configs.p
new file mode 100644
index 0000000000000000000000000000000000000000..4b6e234d0d8dc9753a04c57ae001b4cedd232e1f
GIT binary patch
literal 219
zcmZo*nR0t@ZtV*3Sd5VU&g13M#gMfEK$&}V9K~plAsUxD{
diff --git a/benchmarks/models/utils.py b/benchmarks/models/utils.py
new file mode 100644
--- /dev/null
+++ b/benchmarks/models/utils.py
@@ -0,0 +1,111 @@
+import datetime
+import hashlib
+import json
+import os
+import platform
+import time
+
+import torch
+
+
+def get_arch_name() -> str:
+ if torch.cuda.is_available():
+ return torch.cuda.get_device_name()
+ else:
+ # This returns x86_64 or arm64 (for aarch64)
+ return platform.machine()
+
+
+def write_json_result_ossci(output_json_path, headers, row):
+ """
+ Write the result into JSON format, so that it can be uploaded to the benchmark database
+ to be displayed on OSS dashboard. The JSON format is defined at
+ https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+
+ OSS CI version, that will leave many fields to be filled in by CI
+ """
+ mapping_headers = {headers[i]: v for i, v in enumerate(row)}
+ record = {
+ "benchmark": {
+ "name": "TorchAO benchmark",
+ "mode": "inference",
+ "dtype": mapping_headers["dtype"],
+ "extra_info": {
+ "device": mapping_headers["device"],
+ "arch": mapping_headers["arch"],
+ "min_sqnr": mapping_headers["min_sqnr"],
+ # True means compile is enabled, False means eager mode
+ "compile": mapping_headers["compile"],
+ },
+ },
+ "model": {
+ "name": mapping_headers["name"],
+ "type": "model",
+ "origins": ["torchao"],
+ },
+ "metric": {
+ "name": mapping_headers["metric"],
+ "benchmark_values": [mapping_headers["actual"]],
+ "target_value": mapping_headers["target"],
+ },
+ }
+
+ with open(f"{os.path.splitext(output_json_path)[0]}.json", "a") as f:
+ print(json.dumps(record), file=f)
+
+
+def write_json_result_local(output_json_path, headers, row):
+ """
+ Write the result into JSON format, so that it can be uploaded to the benchmark database
+ to be displayed on OSS dashboard. 
The JSON format is defined at + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + + Local version (filling in dummy values for fields that should be populated by CI) + """ + mapping_headers = {headers[i]: v for i, v in enumerate(row)} + today = datetime.date.today() + sha_hash = hashlib.sha256(str(today).encode("utf-8")).hexdigest() + first_second = datetime.datetime.combine(today, datetime.time.min) + workflow_id = int(first_second.timestamp()) + job_id = workflow_id + 1 + record = { + "timestamp": int(time.time()), + "schema_version": "v3", + "name": "devvm local benchmark", + "repo": "pytorch/ao", + "head_branch": "main", + "head_sha": sha_hash, + "workflow_id": workflow_id, + "run_attempt": 1, + "job_id": job_id, + "benchmark": { + "name": "TorchAO benchmark", + "mode": "inference", + "dtype": mapping_headers["dtype"], + "extra_info": { + "device": mapping_headers["device"], + "arch": mapping_headers["arch"], + "min_sqnr": mapping_headers["min_sqnr"], + # True means compile is enabled, False means eager mode + "compile": mapping_headers["compile"], + }, + }, + "model": { + "name": mapping_headers["name"], + "type": "model", + "origins": ["torchao"], + }, + "metric": { + "name": mapping_headers["metric"], + "benchmark_values": [mapping_headers["actual"]], + "target_value": mapping_headers["target"], + }, + } + + with open(f"{os.path.splitext(output_json_path)[0]}.json", "a") as f: + print(json.dumps(record), file=f)
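
Both writers append one record per line to the file named after `output_json_path` (with its extension replaced by `.json`), so the output is JSON Lines rather than a single JSON array. A minimal sketch for reading such a file back during a local sanity check; the `results.json` filename here is only an illustrative assumption, not something this patch produces:

    import json

    # Each line is one record appended by write_json_result_local / write_json_result_ossci.
    with open("results.json") as f:
        records = [json.loads(line) for line in f if line.strip()]

    for rec in records:
        metric = rec["metric"]
        print(rec["model"]["name"], metric["name"], metric["benchmark_values"], metric["target_value"])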