From 8b22a68fb91eb229f694c90389cce562000f6e2a Mon Sep 17 00:00:00 2001 From: jainapurva Date: Fri, 4 Apr 2025 10:47:22 -0700 Subject: [PATCH 01/10] Update [ghstack-poisoned] From 04f39eff053cf18d87d74d5a3bb9355794b6421f Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 8 Apr 2025 14:50:59 -0700 Subject: [PATCH 02/10] Add profiler --- .../microbenchmarks/benchmark_inference.py | 141 ++++++++-------- .../microbenchmarks/benchmark_runner.py | 22 ++- .../microbenchmarks/test/benchmark_config.yml | 61 +++---- .../test/test_benchmark_profiler.py | 154 +++++++++++++++++ benchmarks/microbenchmarks/utils.py | 155 +++++++++++------- 5 files changed, 374 insertions(+), 159 deletions(-) create mode 100644 benchmarks/microbenchmarks/test/test_benchmark_profiler.py diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index c084d18d3a..da01053202 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -20,6 +20,7 @@ BenchmarkResult, clean_caches, create_model_and_input, + generate_model_profile, model_inference_time_in_ms, string_to_config, ) @@ -29,70 +30,80 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: """Run inference benchmarks""" - clean_caches() # Clean caches - - # Create output directory if it doesn't exist - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - base_model, input_data = create_model_and_input( - config.model_type, - config.m, - config.k, - config.n, - high_precision_dtype=config.high_precision_dtype, - device=config.device, - ) - - # Use quantize_ to apply each quantization function to the model - m_copy = deepcopy(base_model).eval().to(config.device) - ao_base_config = string_to_config( - config.quantization, - config.sparsity, - high_precision_dtype=config.high_precision_dtype, - ) - - # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) - is_cuda = config.device == "cuda" and torch.cuda.is_available() - - if config.sparsity is not None and ( - config.quantization is None or "baseline" in config.quantization - ): - if is_cuda: - print(f"Applying {config.sparsity} sparsity to model") - sparsify_(m_copy, ao_base_config) + try: + clean_caches() # Clean caches + + # Create output directory if it doesn't exist + Path(config.output_dir).mkdir(parents=True, exist_ok=True) + + base_model, input_data = create_model_and_input( + config.model_type, + config.m, + config.k, + config.n, + high_precision_dtype=config.high_precision_dtype, + device=config.device, + ) + + # Use quantize_ to apply each quantization function to the model + m_copy = deepcopy(base_model).eval().to(config.device) + ao_base_config = string_to_config( + config.quantization, + config.sparsity, + high_precision_dtype=config.high_precision_dtype, + ) + + # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) + is_cuda = config.device == "cuda" and torch.cuda.is_available() + + if config.sparsity is not None and ( + config.quantization is None or "baseline" in config.quantization + ): + if is_cuda: + print(f"Applying {config.sparsity} sparsity to model") + sparsify_(m_copy, ao_base_config) + else: + print( + f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + ) + elif config.sparsity is None and ( + config.quantization is None or "baseline" in config.quantization + ): + pass # No quantization or sparsity specified, do nothing else: - print( - 
f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + print("Quantizing model....") + quantize_(m_copy, ao_base_config) + + if config.use_torch_compile: + print("Compiling model....") + m_copy = torch.compile( + m_copy, mode=config.torch_compile_mode, fullgraph=True ) - elif config.sparsity is None and ( - config.quantization is None or "baseline" in config.quantization - ): - pass # No quantization or sparsity specified, do nothing - else: - print("Quantizing model....") - quantize_(m_copy, ao_base_config) - - if config.use_torch_compile: - print("Compiling model....") - m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) - - # Run benchmarks - result = BenchmarkResult(config=config) - - # Benchmark time to run an inference call for quantized model - result.model_inference_time_in_ms = model_inference_time_in_ms( - model=m_copy, input_data=input_data - ) - - # TODO: Benchmark time using profiler - # Profile dtype model evaluation - # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype) - # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details - - # TODO: Benchmark gemm time using cuda graph - # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs) - - # TODO: Benchmark op with cuda graph - # time = benchmark_op_with_cuda_graph(op, args) - - return result + + # Run benchmarks + result = BenchmarkResult(config=config) + # Store result in model for memory profiling + m_copy._benchmark_result = result + + # Benchmark time to run an inference call for quantized model + result.model_inference_time_in_ms = model_inference_time_in_ms( + model=m_copy, input_data=input_data + ) + + # Run profiler if enabled + if config.enable_profiler: + print("Running profiler...") + try: + result.profiler_json_path, result.perfetto_url = generate_model_profile( + m_copy, input_data, config.profiler_file_name + ) + except Exception as e: + print(f"Error running profiler: {e}") + + return result + except Exception as e: + print(f"Error in benchmark run: {e}") + import traceback + + print(traceback.format_exc()) + return None diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 7152542eec..1a60ca6b16 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -164,16 +164,22 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}" ) result = run_inference(config) # Pass the config object directly - results.append(result) - except Exception: - print(f"Error running benchmark {config.name}") - continue + if result is not None: # Only add successful results + results.append(result) + except Exception as e: + import traceback - # Add results to csv - generate_results_csv(results, configs[0].output_dir) + print(f"Error running benchmark {config.name} with error: {e}") + print(traceback.format_exc()) + continue - # Print results - print_results(results) + # Add results to csv if there are any + if results: + generate_results_csv(results, configs[0].output_dir) + # Print results + print_results(results) + else: + print("No benchmark results were collected. All benchmarks failed.") # TODO: Process results: Speedups: # 1. 
For different shapes for same model and quantization diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 97a38469de..227cb90948 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -2,46 +2,51 @@ benchmark_mode: "inference" quantization_config_recipe_names: # Will run a baseline inference for model by default, without quantization for comparison - - "int4wo-32" - - "marlin" -sparsity_config_recipe_names: + # - "int4wo-32" + # - "marlin" + - "int8wo" +# sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - - "semi-sparse" - - "block" + # - "semi-sparse" + # - "block" output_dir: "benchmarks/microbenchmarks/results" model_params: - - name: "small_bf16_linear" - matrix_shapes: - - name: "custom" - shapes: [ - [1024, 1024, 1024], # [m, k, n] - ] - high_precision_dtype: "torch.bfloat16" - use_torch_compile: true - torch_compile_mode: "max-autotune" - device: "cuda" - model_type: "linear" + # - name: "small_bf16_linear" + # matrix_shapes: + # - name: "custom" + # shapes: [ + # [1024, 1024, 1024], # [m, k, n] + # ] + # high_precision_dtype: "torch.bfloat16" + # use_torch_compile: true + # torch_compile_mode: "max-autotune" + # device: "cuda" + # model_type: "linear" + # enable_profiler: true # Enable profiling for this model - name: "large_bf16_ln_linear" matrix_shapes: - name: "custom" shapes: [ [2048, 4096, 1024], - [4096, 4096, 1024] + # [4096, 4096, 1024] ] high_precision_dtype: "torch.bfloat16" use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" - model_type: "ln_linear_sigmoid" - - - name: "cpu_fp32_linear" - matrix_shapes: - - name: "custom" - shapes: [ - [4096, 4096, 1024] - ] - high_precision_dtype: "torch.float32" - use_torch_compile: false - device: "cpu" model_type: "linear" + enable_profiler: true # Enable profiling for this model + enable_memory_profile: true # Enable memory profiling for this model + + # - name: "cpu_fp32_linear" + # matrix_shapes: + # - name: "custom" + # shapes: [ + # [4096, 4096, 1024] + # ] + # high_precision_dtype: "torch.float32" + # use_torch_compile: false + # device: "cpu" + # model_type: "linear" + # enable_profiler: true # Enable profiling for this model diff --git a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py new file mode 100644 index 0000000000..2322b1b1c5 --- /dev/null +++ b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py @@ -0,0 +1,154 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
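+#
+# These tests exercise generate_model_profile on a small ToyLinearModel and
+# verify that the exported Chrome trace JSON exists, is non-empty, and contains
+# a "traceEvents" list with the expected event fields (name, ph, ts, pid);
+# CUDA events are additionally checked when a GPU is available.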
+ +import json +import os +import unittest + +import torch + +from benchmarks.microbenchmarks.utils import ( + BenchmarkConfig, + ToyLinearModel, + generate_model_profile, +) + + +class TestBenchmarkProfiler(unittest.TestCase): + def setUp(self): + self.test_dir = os.path.dirname(os.path.abspath(__file__)) + self.results_dir = os.path.join(self.test_dir, "results") + os.makedirs(self.results_dir, exist_ok=True) + + # Set up a simple model and input for testing + self.m, self.k, self.n = 1024, 1024, 1024 + self.dtype = torch.bfloat16 + self.model = ToyLinearModel(k=self.k, n=self.n, dtype=self.dtype) + self.input_data = torch.randn(1, self.k, dtype=self.dtype) + + # Move to appropriate device + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model = self.model.to(self.device) + self.input_data = self.input_data.to(self.device) + + def tearDown(self): + # Clean up any generated files + import shutil + + if os.path.exists(self.results_dir): + shutil.rmtree(self.results_dir) + + def test_profiler_enabled(self): + """Test that profiler works when enabled""" + config = BenchmarkConfig( + quantization=None, + sparsity=None, + params={ + "enable_profiler": True, + "device": self.device, + }, + shape_name="test", + shape=[self.m, self.k, self.n], + output_dir=self.results_dir, + benchmark_mode="inference", + ) + + profile_path = os.path.join( + self.results_dir, + "profiler", + f"{config.name}_{self.m}_{self.k}_{self.n}_profile.json", + ) + + # Generate profile + result_path = generate_model_profile(self.model, self.input_data, profile_path) + + # Check that profile file exists and is not empty + self.assertTrue(os.path.exists(result_path)) + self.assertGreater(os.path.getsize(result_path), 0) + + # Verify it's valid JSON + with open(result_path) as f: + profile_data = json.load(f) + self.assertIsInstance(profile_data, dict) + + def test_profiler_basic_output(self): + """Test that profiler output contains expected basic fields""" + config = BenchmarkConfig( + quantization=None, + sparsity=None, + params={ + "enable_profiler": True, + "device": self.device, + }, + shape_name="test", + shape=[self.m, self.k, self.n], + output_dir=self.results_dir, + benchmark_mode="inference", + ) + + profile_path = os.path.join( + self.results_dir, + "profiler", + f"{config.name}_{self.m}_{self.k}_{self.n}_profile.json", + ) + + result_path = generate_model_profile(self.model, self.input_data, profile_path) + + with open(result_path) as f: + data = json.load(f) + + # Check for required Chrome Trace Event format fields + self.assertIn("traceEvents", data) + self.assertTrue(isinstance(data["traceEvents"], list)) + + # Check that we have some events + self.assertGreater(len(data["traceEvents"]), 0) + + # Check event format + event = data["traceEvents"][0] + self.assertIn("name", event) + self.assertIn("ph", event) # Phase + self.assertIn("ts", event) # Timestamp + self.assertIn("pid", event) # Process ID + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_cuda_profiling(self): + """Test CUDA profiling when available""" + config = BenchmarkConfig( + quantization=None, + sparsity=None, + params={ + "enable_profiler": True, + "device": "cuda", + }, + shape_name="test", + shape=[self.m, self.k, self.n], + output_dir=self.results_dir, + benchmark_mode="inference", + ) + + profile_path = os.path.join( + self.results_dir, + "profiler", + f"{config.name}_{self.m}_{self.k}_{self.n}_profile.json", + ) + + result_path = generate_model_profile( + self.model.cuda(), 
self.input_data.cuda(), profile_path + ) + + with open(result_path) as f: + data = json.load(f) + + # Check for CUDA events + cuda_events = [ + event for event in data["traceEvents"] if "cuda" in event.get("name", "") + ] + self.assertGreater(len(cuda_events), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index fd3db11591..1973b57304 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -9,6 +9,7 @@ import torch from tabulate import tabulate +from torch.profiler import ProfilerActivity from torch.utils.benchmark import Timer from torchao.core.config import AOBaseConfig @@ -50,6 +51,57 @@ def get_default_device(device: str = "cuda") -> str: return "cpu" +def generate_model_profile(model, input_data, profile_file_path): + """Function to benchmark model evaluation with profiling. + + Args: + model: The model to profile + input_data: Input data for the model + profile_file_path: Path to save the profiler output + + Returns: + Tuple of (profile_file_path, perfetto_url) + """ + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(profile_file_path), exist_ok=True) + + # Set up profiler activities based on device + activities = [ProfilerActivity.CPU] + device = next(model.parameters()).device + if device.type == "cuda" and torch.cuda.is_available(): + activities.append(ProfilerActivity.CUDA) + + # Run profiler with minimal settings to ensure compatibility + prof = torch.profiler.profile( + activities=activities, + record_shapes=True, + with_stack=True, + profile_memory=True, + with_flops=True, # Excperiemntal; might be unreliable for some layers + ) + + # Warm up + with torch.no_grad(): + for _ in range(3): + _ = model(input_data) + if device.type == "cuda": + torch.cuda.synchronize() + + # Profile + with prof: + with torch.no_grad(): + for _ in range(3): + _ = model(input_data) + if device.type == "cuda": + torch.cuda.synchronize() + + # Save profiling details + prof.export_chrome_trace(profile_file_path) + print(f"Profile saved to: {profile_file_path}") + + return profile_file_path + + class BenchmarkConfig: def __init__( self, @@ -84,6 +136,14 @@ def __init__( "name", f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", ) + self.enable_profiler = bool(params.get("enable_profiler", False)) + # Create profiler directory path without leading slash + profiler_dir = os.path.join(self.output_dir, "profiler") + os.makedirs(profiler_dir, exist_ok=True) + file_name = f"{self.name}_{self.m}_{self.k}_{self.n}_quant_{self.quantization}_sparsity_{self.sparsity}" + self.profiler_file_name = os.path.join( + profiler_dir, f"{file_name}_profile.json" + ) @staticmethod def _parse_precision(precision_str: str) -> torch.dtype: @@ -105,6 +165,7 @@ def to_dict(self) -> Dict[str, Any]: "device": self.device, "model_type": self.model_type, "output_dir": self.output_dir, + "enable_profiler": self.enable_profiler, } @@ -116,13 +177,16 @@ def __init__( self.config = config self.output_dir = config.output_dir self.model_inference_time_in_ms = 0.0 + self.profiler_json_path: Optional[str] = None def to_dict(self) -> Dict[str, Any]: """Convert result to dictionary for main function""" - return { + result_dict = { **self.config.to_dict(), "model_inference_time_in_ms": self.model_inference_time_in_ms, + "profiler_json_path": self.profiler_json_path, } + return result_dict class 
ToyLinearModel(torch.nn.Module): @@ -373,6 +437,11 @@ def generate_results_csv( output_dir (str): Directory to save the CSV file. file_name (str, optional): Name of the CSV file. Defaults to "results.csv". """ + # Check if results list is empty + if len(results) == 0: + print("No results to save to CSV.") + return + # Create the output directory if it doesn't exist os.makedirs(output_dir, exist_ok=True) file_path = os.path.join(output_dir, file_name) @@ -390,68 +459,38 @@ def generate_results_csv( def print_results(results: List[BenchmarkResult]): - """Print benchmark results in a formatted table. - - Args: - results (List[BenchmarkResult]): List of benchmark results - """ + """Print results in a table format""" if not results: print("No results to display") return - # Extract relevant columns for display - display_columns = [ - "quantization", - "sparsity", - "model_type", - "m", - "k", - "n", - "model_inference_time_in_ms", - "use_torch_compile", - ] - - # Format data for tabulate - headers = { - "quantization": "Quantization", - "sparsity": "Sparsity", - "model_type": "Model Type", - "m": "M", - "k": "K", - "n": "N", - "model_inference_time_in_ms": "Time (μs)", - "use_torch_compile": "Compile Mode", - } - - # Extract and format data table_data = [] for result in results: - result_dict = result.to_dict() - row = [] - for col in display_columns: - value = result_dict.get(col, "N/A") - if value is None: - value = "N/A" - if col == "model_inference_time_in_ms": - value = f"{value:.2f}" if isinstance(value, (int, float)) else value - elif col == "use_torch_compile": - # Show compile mode if compile is True, otherwise show False - value = ( - result_dict.get("torch_compile_mode", "default") - if result_dict.get("use_torch_compile") - else "False" - ) - row.append(value) + if result is None: + continue + + row = [ + result.config.name, + result.config.quantization or "baseline", + result.config.sparsity or "none", + f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})" + f"{result.model_inference_time_in_ms:.2f}", + str(result.config.enable_profiler), + ] + table_data.append(row) - # Print formatted table - print("\nBenchmark Results:") - print( - tabulate( - table_data, - headers=[headers[col] for col in display_columns], - tablefmt="grid", - floatfmt=".2f", - ) - ) - print() + # Define headers + headers = [ + "Name", + "Quantization", + "Sparsity", + "Inference Time (ms)", + "Profiler Enabled", + ] + + if table_data: + print("\nBenchmark Results:") + print(tabulate(table_data, headers=headers, tablefmt="grid")) + else: + print("\nNo valid results to display") From 4b7ea5d4ac3bff5907973aee09ef8e590af19c6b Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 10 Apr 2025 11:31:07 -0700 Subject: [PATCH 03/10] Add support for different models and different shapes --- benchmarks/microbenchmarks/README.md | 62 ++++++- .../microbenchmarks/benchmark_inference.py | 22 ++- .../microbenchmarks/benchmark_runner.py | 46 ++++- .../microbenchmarks/test/benchmark_config.yml | 84 +++++---- .../test/test_benchmark_profiler.py | 2 +- .../test/test_benchmark_runner.py | 60 +++++++ benchmarks/microbenchmarks/test/test_utils.py | 18 +- benchmarks/microbenchmarks/utils.py | 57 +----- test/test_model_architecture.py | 30 ++++ torchao/testing/model_architectures.py | 167 ++++++++++++++++++ 10 files changed, 436 insertions(+), 112 deletions(-) create mode 100644 test/test_model_architecture.py create mode 100644 torchao/testing/model_architectures.py diff --git 
a/benchmarks/microbenchmarks/README.md b/benchmarks/microbenchmarks/README.md index a95dc53755..d65b295645 100644 --- a/benchmarks/microbenchmarks/README.md +++ b/benchmarks/microbenchmarks/README.md @@ -63,7 +63,15 @@ Currently, quantization string is in same format as the one being passed in llam ### Model Types - `linear`: Simple linear layer -- `ln_linear_sigmoid`: LayerNorm + Linear + Sigmoid +- `ln_linear_`: LayerNorm + Linear + Activation, where activation can be: + - `ln_linear_sigmoid`: LayerNorm + Linear + Sigmoid + - `ln_linear_relu`: LayerNorm + Linear + ReLU + - `ln_linear_leakyrelu`: LayerNorm + Linear + LeakyReLU + - `ln_linear_relu6`: LayerNorm + Linear + ReLU6 + - `ln_linear_gelu`: LayerNorm + Linear + GELU + - `ln_linear_silu`: LayerNorm + Linear + SiLU + - `ln_linear_hardswish`: LayerNorm + Linear + Hardswish +- `transformer_block`: Transformer block with self-attention and MLP ### Device Options - `cuda`: NVIDIA GPU @@ -71,6 +79,58 @@ Currently, quantization string is in same format as the one being passed in llam - `mps`: Apple Silicon GPU - `cpu`: CPU fallback +### Shape Generation Options +- `custom`: Manually specify shapes as a list of [m, k, n] dimensions + ```yaml + matrix_shapes: + - name: "custom" + shapes: [ + [1024, 1024, 1024], # [m, k, n] + [2048, 4096, 1024] + ] + ``` + +- `llama`: Use LLaMa 2 70B single-node weight shapes (assumes fused attn.wqkv and ffn.w13) + - Generates shapes for: "attn.wqkv", "attn.w0", "ffn.w13", "ffn.w2" + ```yaml + matrix_shapes: + - name: "llama" + ``` + +- `pow2`: Generate shapes with dimensions that are powers of 2 + - Parameters: + - `min_power`: Minimum power of 2 (default: 10, which is 1024) + - `max_power`: Maximum power of 2 (default: 14, which is 16,384) + ```yaml + matrix_shapes: + - name: "pow2" + min_power: 10 # 2^10 = 1024 + max_power: 12 # 2^12 = 4096 + ``` + +- `pow2_extended`: Generate shapes with dimensions that are powers of 2 and powers of 2 + half + - Parameters: + - `min_power`: Minimum power of 2 (default: 10, which is 1024) + - `max_power`: Maximum power of 2 (default: 14, which is 16,384) + ```yaml + matrix_shapes: + - name: "pow2_extended" + min_power: 10 # Generates: 1024, 1536, 2048, 3072, etc. 
+ max_power: 11 + ``` + +- `sweep`: Generate a sweep of shapes with different powers of 2 for M, K, N dimensions + - Parameters: + - `min_power`: Minimum power of 2 (default: 8, which is 256) + - `max_power`: Maximum power of 2 (default: 15, which is 32,768) + - Note: This generates all combinations of M, K, N dimensions, which can be a large number of shapes + ```yaml + matrix_shapes: + - name: "sweep" + min_power: 8 # 2^8 = 256 + max_power: 9 # 2^9 = 512 + ``` + ## Output Results are saved to a CSV file in the specified output directory diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index da01053202..a36041f185 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -19,13 +19,15 @@ BenchmarkConfig, BenchmarkResult, clean_caches, - create_model_and_input, generate_model_profile, model_inference_time_in_ms, string_to_config, ) from torchao.quantization import quantize_ from torchao.sparsity.sparse_api import sparsify_ +from torchao.testing.model_architectures import ( + create_model_and_input_data, +) def run(config: BenchmarkConfig) -> BenchmarkResult: @@ -36,7 +38,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Create output directory if it doesn't exist Path(config.output_dir).mkdir(parents=True, exist_ok=True) - base_model, input_data = create_model_and_input( + base_model, input_data = create_model_and_input_data( config.model_type, config.m, config.k, @@ -94,16 +96,12 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: if config.enable_profiler: print("Running profiler...") try: - result.profiler_json_path, result.perfetto_url = generate_model_profile( + result.profiler_json_path = generate_model_profile( m_copy, input_data, config.profiler_file_name ) - except Exception as e: - print(f"Error running profiler: {e}") - + except Exception: + print(f"Error running profiler for {config.name}") return result - except Exception as e: - print(f"Error in benchmark run: {e}") - import traceback - - print(traceback.format_exc()) - return None + except Exception: + print(f"Error in benchmark run: {config.name}") + return diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 1a60ca6b16..0c137121ac 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -48,9 +48,50 @@ def get_shapes_for_config( name = shape_config["name"] if name == "custom": shapes.extend([(name, shape) for shape in shape_config["shapes"]]) + elif name == "llama": + # LLaMa 2 70B single-node weight shapes + # assumes fused attn.wqkv and ffn.w13 + bsz, seq_len = 4, 4096 + M = bsz * seq_len + llama_shapes = { + "attn.wqkv": (M, 8192, 1280), + "attn.w0": (M, 1024, 8192), + "ffn.w13": (M, 8192, 7168), + "ffn.w2": (M, 3584, 8192), + } + shapes.extend([(f"{name}_{k}", v) for k, v in llama_shapes.items()]) + elif name == "pow2": + # Generate shapes with dimensions that are powers of 2 + min_power_of_2 = shape_config.get("min_power", 10) # 1024 + max_power_of_2 = shape_config.get("max_power", 14) # 16,384 + for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)): + val = 2**power_of_2 + shapes.append((f"{name}_{idx}", [val, val, val])) + elif name == "pow2_extended": + # Generate shapes with dimensions that are powers of 2 and powers of 2 + half + min_power_of_2 = shape_config.get("min_power", 10) # 1024 + max_power_of_2 = 
shape_config.get("max_power", 14) # 16,384 + for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)): + val1 = 2**power_of_2 + val2 = 2**power_of_2 + 2 ** (power_of_2 - 1) + shapes.append((f"{name}_{idx*2}", [val1, val1, val1])) + shapes.append((f"{name}_{idx*2+1}", [val2, val2, val2])) + elif name == "sweep": + # Generate a sweep of shapes with different powers of 2 for M, K, N + min_p2 = shape_config.get("min_power", 8) # 256 + max_p2 = shape_config.get("max_power", 15) # 32,768 + counter = 0 + for M_p2 in range(min_p2, max_p2 + 1): + M = 2**M_p2 + for K_p2 in range(min_p2, max_p2 + 1): + K = 2**K_p2 + for N_p2 in range(min_p2, max_p2 + 1): + N = 2**N_p2 + shapes.append((f"{name}_{counter}", [M, K, N])) + counter += 1 else: raise NotImplementedError( - f"Shape config {name} not supported. Currently only supports custom shapes." + f"Shape config {name} not supported. Supported options: custom, llama, pow2, pow2_extended, sweep." ) return shapes @@ -167,10 +208,7 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None if result is not None: # Only add successful results results.append(result) except Exception as e: - import traceback - print(f"Error running benchmark {config.name} with error: {e}") - print(traceback.format_exc()) continue # Add results to csv if there are any diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 227cb90948..f47c41435a 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -2,34 +2,22 @@ benchmark_mode: "inference" quantization_config_recipe_names: # Will run a baseline inference for model by default, without quantization for comparison - # - "int4wo-32" - # - "marlin" - "int8wo" -# sparsity_config_recipe_names: + - "int8dq" + - "float8dq" +sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - # - "semi-sparse" - # - "block" + - "semi-sparse" + - "block" output_dir: "benchmarks/microbenchmarks/results" model_params: - # - name: "small_bf16_linear" - # matrix_shapes: - # - name: "custom" - # shapes: [ - # [1024, 1024, 1024], # [m, k, n] - # ] - # high_precision_dtype: "torch.bfloat16" - # use_torch_compile: true - # torch_compile_mode: "max-autotune" - # device: "cuda" - # model_type: "linear" - # enable_profiler: true # Enable profiling for this model - - - name: "large_bf16_ln_linear" + - name: "small_bf16_linear" matrix_shapes: - name: "custom" shapes: [ + [1024, 1024, 1024], # [m, k, n] [2048, 4096, 1024], - # [4096, 4096, 1024] + [4096, 4096, 1024] ] high_precision_dtype: "torch.bfloat16" use_torch_compile: true @@ -37,16 +25,48 @@ model_params: device: "cuda" model_type: "linear" enable_profiler: true # Enable profiling for this model - enable_memory_profile: true # Enable memory profiling for this model - # - name: "cpu_fp32_linear" - # matrix_shapes: - # - name: "custom" - # shapes: [ - # [4096, 4096, 1024] - # ] - # high_precision_dtype: "torch.float32" - # use_torch_compile: false - # device: "cpu" - # model_type: "linear" - # enable_profiler: true # Enable profiling for this model + - name: "ln_linear_sigmoid_cuda" + matrix_shapes: + - name: "custom" + shapes: [ + [2048, 4096, 1024], + ] + high_precision_dtype: "torch.bfloat16" + use_torch_compile: true + torch_compile_mode: "max-autotune" + device: "cuda" + model_type: "ln_linear_sigmoid" + enable_profiler: true + + - name: 
"bf16_transformer_block" + matrix_shapes: + - name: "custom" + shapes: [ + [2048, 4096, 1024], # For transformer_block, k is the hidden dimension + ] + high_precision_dtype: "torch.bfloat16" + use_torch_compile: true + torch_compile_mode: "max-autotune" + device: "cuda" + model_type: "transformer_block" # TODO: Add a custom model (Figure out how to do this, maybe pass a .py file with model definition) + enable_profiler: true + + - name: "large_bf16_ln_linear" + matrix_shapes: + - name: "llama" # Example of using LLaMa shapes + - name: "pow2" # Example of using power of 2 shapes + min_power: 10 # 1024 + max_power: 12 # 4096 + - name: "pow2_extended" # Example of using extended power of 2 shapes + min_power: 10 # 1024 + max_power: 11 # 2048 + - name: "sweep" # Example of using sweep shapes (commented out as it generates many shapes) + min_power: 8 # 256 + max_power: 9 # 512 + high_precision_dtype: "torch.bfloat16" + use_torch_compile: true + torch_compile_mode: "max-autotune" + device: "cuda" + model_type: "linear" + enable_profiler: true # Enable profiling for this model diff --git a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py index 2322b1b1c5..91bd180db1 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py @@ -12,9 +12,9 @@ from benchmarks.microbenchmarks.utils import ( BenchmarkConfig, - ToyLinearModel, generate_model_profile, ) +from torchao.testing.model_architectures import ToyLinearModel class TestBenchmarkProfiler(unittest.TestCase): diff --git a/benchmarks/microbenchmarks/test/test_benchmark_runner.py b/benchmarks/microbenchmarks/test/test_benchmark_runner.py index a8683a1de8..7f93213a22 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_runner.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_runner.py @@ -57,12 +57,72 @@ def tearDown(self): shutil.rmtree(self.temp_dir) def test_get_shapes_for_config(self): + # Test custom shapes shapes = get_shapes_for_config( self.test_config["model_params"][0]["matrix_shapes"] ) self.assertEqual(len(shapes), 1) self.assertEqual(shapes[0], ("custom", [1024, 1024, 1024])) + # Test llama shapes + llama_shapes = get_shapes_for_config([{"name": "llama"}]) + self.assertEqual(len(llama_shapes), 4) # 4 LLaMa shapes + self.assertTrue( + any(name.startswith("llama_attn.wqkv") for name, _ in llama_shapes) + ) + self.assertTrue( + any(name.startswith("llama_attn.w0") for name, _ in llama_shapes) + ) + self.assertTrue( + any(name.startswith("llama_ffn.w13") for name, _ in llama_shapes) + ) + self.assertTrue( + any(name.startswith("llama_ffn.w2") for name, _ in llama_shapes) + ) + + # Test pow2 shapes + pow2_shapes = get_shapes_for_config( + [{"name": "pow2", "min_power": 10, "max_power": 12}] + ) + self.assertEqual(len(pow2_shapes), 3) # 3 powers of 2 (10, 11, 12) + self.assertEqual(pow2_shapes[0], ("pow2_0", [1024, 1024, 1024])) # 2^10 + self.assertEqual(pow2_shapes[1], ("pow2_1", [2048, 2048, 2048])) # 2^11 + self.assertEqual(pow2_shapes[2], ("pow2_2", [4096, 4096, 4096])) # 2^12 + + # Test pow2_extended shapes + pow2_extended_shapes = get_shapes_for_config( + [{"name": "pow2_extended", "min_power": 10, "max_power": 11}] + ) + self.assertEqual( + len(pow2_extended_shapes), 4 + ) # 2 powers of 2, each with 2 variants + self.assertEqual( + pow2_extended_shapes[0], ("pow2_extended_0", [1024, 1024, 1024]) + ) # 2^10 + self.assertEqual( + pow2_extended_shapes[1], ("pow2_extended_1", [1536, 
1536, 1536]) + ) # 2^10 + 2^9 + self.assertEqual( + pow2_extended_shapes[2], ("pow2_extended_2", [2048, 2048, 2048]) + ) # 2^11 + self.assertEqual( + pow2_extended_shapes[3], ("pow2_extended_3", [3072, 3072, 3072]) + ) # 2^11 + 2^10 + + # Test sweep shapes (limited to a small range for testing) + sweep_shapes = get_shapes_for_config( + [{"name": "sweep", "min_power": 8, "max_power": 9}] + ) + # For min_power=8, max_power=9, we should have 8 shapes (2^3 = 8 combinations) + self.assertEqual(len(sweep_shapes), 8) + # Check that all shapes have the expected format + for name, shape in sweep_shapes: + self.assertTrue(name.startswith("sweep_")) + self.assertEqual(len(shape), 3) # [M, K, N] + # Check that all dimensions are powers of 2 between 2^8 and 2^9 + for dim in shape: + self.assertTrue(dim in [256, 512]) # 2^8, 2^9 + def test_get_param_combinations(self): model_param = self.test_config["model_params"][0] shapes, params = get_param_combinations(model_param) diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 14f226bd7e..bb721e9e03 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -16,15 +16,17 @@ BlockSparseWeightConfig, Float8DynamicActivationFloat8SemiSparseWeightConfig, Int4WeightOnlyConfig, - LNLinearSigmoid, SemiSparseWeightConfig, - ToyLinearModel, clean_caches, - create_model_and_input, generate_results_csv, get_default_device, string_to_config, ) +from torchao.testing.model_architectures import ( + LNLinearActivationModel, + ToyLinearModel, + create_model_and_input_data, +) class TestUtils(unittest.TestCase): @@ -153,7 +155,7 @@ def test_toy_linear_model(self): self.assertEqual(out.dtype, torch.float32) def test_ln_linear_sigmoid(self): - model = LNLinearSigmoid(fc_dim1=64, fc_dim2=32, dtype=torch.float32) + model = LNLinearActivationModel(fc_dim1=64, fc_dim2=32, dtype=torch.float32) x = torch.randn(16, 64) out = model(x) self.assertEqual(out.shape, (16, 32)) @@ -162,9 +164,9 @@ def test_ln_linear_sigmoid(self): torch.all((out >= 0) & (out <= 1)) ) # Check sigmoid output range - def test_create_model_and_input(self): + def test_create_model_and_input_data(self): m, k, n = 16, 64, 32 - model, input_data = create_model_and_input( + model, input_data = create_model_and_input_data( model_type="linear", m=m, k=k, @@ -175,7 +177,7 @@ def test_create_model_and_input(self): self.assertIsInstance(model, ToyLinearModel) self.assertEqual(input_data.shape, (m, k)) - model, input_data = create_model_and_input( + model, input_data = create_model_and_input_data( model_type="ln_linear_sigmoid", m=m, k=k, @@ -183,7 +185,7 @@ def test_create_model_and_input(self): high_precision_dtype=torch.float32, device="cpu", ) - self.assertIsInstance(model, LNLinearSigmoid) + self.assertIsInstance(model, LNLinearActivationModel) self.assertEqual(input_data.shape, (m, k)) def test_generate_results_csv(self): diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 1973b57304..883cf264ac 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -60,7 +60,7 @@ def generate_model_profile(model, input_data, profile_file_path): profile_file_path: Path to save the profiler output Returns: - Tuple of (profile_file_path, perfetto_url) + profile_file_path """ # Create parent directory if it doesn't exist os.makedirs(os.path.dirname(profile_file_path), exist_ok=True) @@ -189,30 +189,6 @@ def to_dict(self) -> Dict[str, Any]: 
return result_dict -class ToyLinearModel(torch.nn.Module): - def __init__(self, k=64, n=32, dtype=torch.bfloat16): - super().__init__() - self.linear1 = torch.nn.Linear(k, n, bias=False).to(dtype) - - def forward(self, x): - x = self.linear1(x) - return x - - -class LNLinearSigmoid(torch.nn.Module): - def __init__(self, fc_dim1, fc_dim2, dtype=torch.bfloat16): - super().__init__() - self.ln = torch.nn.LayerNorm(fc_dim1, elementwise_affine=False) - self.fc = torch.nn.Linear(fc_dim1, fc_dim2, bias=False).to(dtype) - self.sigmoid = torch.nn.Sigmoid() - - def forward(self, x): - x = self.ln(x) - x = self.fc(x) - x = self.sigmoid(x) - return x - - def string_to_config( quantization: Optional[str], sparsity: Optional[str], **kwargs ) -> AOBaseConfig: @@ -383,34 +359,6 @@ def model_inference_time_in_ms(model, input_data): return res * 1e6 -def create_model_and_input( - model_type: str, - m: int, - k: int, - n: int, - high_precision_dtype: torch.dtype = torch.bfloat16, - device: str = get_default_device(), -): - """Create a model and input data for benchmarking. - - Args: - model_type (str): type of the model to be created - batch_size (int): batch size of the input data - device (str): device to run the model on - high_precision_dtype (torch.dtype): data type of the model - m, k, n (int): dimensions of the model and input data - """ - if model_type == "linear": - model = ToyLinearModel(k, n, high_precision_dtype).to(device) - input_data = torch.randn(m, k, device=device, dtype=high_precision_dtype) - elif model_type == "ln_linear_sigmoid": - model = LNLinearSigmoid(k, n, high_precision_dtype).to(device) - input_data = torch.randn(m, k, device=device, dtype=high_precision_dtype) - else: - raise ValueError(f"Unknown model type: {model_type}") - return model, input_data - - def clean_caches(): import gc @@ -473,7 +421,7 @@ def print_results(results: List[BenchmarkResult]): result.config.name, result.config.quantization or "baseline", result.config.sparsity or "none", - f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})" + f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})", f"{result.model_inference_time_in_ms:.2f}", str(result.config.enable_profiler), ] @@ -485,6 +433,7 @@ def print_results(results: List[BenchmarkResult]): "Name", "Quantization", "Sparsity", + "Shape", "Inference Time (ms)", "Profiler Enabled", ] diff --git a/test/test_model_architecture.py b/test/test_model_architecture.py new file mode 100644 index 0000000000..433473ae5e --- /dev/null +++ b/test/test_model_architecture.py @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
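+#
+# Smoke tests for the shared model architectures in
+# torchao.testing.model_architectures: each test builds a model and matching
+# input via create_model_and_input_data and checks the forward-pass output
+# shape for the linear, ln_linear_sigmoid, and transformer_block model types.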
+ +import unittest + +from torchao.testing.model_architectures import create_model_and_input_data + + +class TestModels(unittest.TestCase): + def test_toy_linear_model(self): + model, input_data = create_model_and_input_data("linear", 10, 64, 32) + output = model(input_data) + self.assertEqual(output.shape, (10, 32)) + + def test_ln_linear_activation_model(self): + model, input_data = create_model_and_input_data("ln_linear_sigmoid", 10, 64, 32) + output = model(input_data) + self.assertEqual(output.shape, (10, 32)) + + def test_transformer_block(self): + model, input_data = create_model_and_input_data("transformer_block", 10, 64, 32) + output = model(input_data) + self.assertEqual(output.shape, (10, 16, 64)) + + +if __name__ == "__main__": + unittest.main() diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py new file mode 100644 index 0000000000..cb528e55ae --- /dev/null +++ b/torchao/testing/model_architectures.py @@ -0,0 +1,167 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +import re + +import torch +import torch.nn as nn +from torch.nn import RMSNorm + + +class ToyLinearModel(torch.nn.Module): + def __init__(self, k=64, n=32, dtype=torch.bfloat16): + super().__init__() + self.linear1 = torch.nn.Linear(k, n, bias=False).to(dtype) + + def forward(self, x): + x = self.linear1(x) + return x + + +class LNLinearActivationModel(nn.Module): + def __init__( + self, fc_dim1, fc_dim2, dtype=torch.bfloat16, activation="sigmoid", device=None + ): + super().__init__() + + activation = activation.lower() + activation_map = { + "relu": nn.ReLU(), + "sigmoid": nn.Sigmoid(), + "leakyrelu": nn.LeakyReLU(), + "relu6": nn.ReLU6(), + "gelu": nn.GELU(), + "silu": nn.SiLU(), + "hardswish": nn.Hardswish(), + } + + if activation not in activation_map: + raise ValueError(f"Unsupported activation: {activation}") + + self.ln = nn.LayerNorm(fc_dim1, elementwise_affine=False) + self.fc = nn.Linear(fc_dim1, fc_dim2, bias=False).to(dtype=dtype, device=device) + self.activation = activation_map[activation] + + def forward(self, x): + x = self.ln(x) + x = self.fc(x) + return self.activation(x) + + +class TransformerBlock(torch.nn.Module): + def __init__(self, hidden_dim, num_heads=8, mlp_ratio=4, dtype=torch.bfloat16): + super().__init__() + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.head_dim = hidden_dim // num_heads + + # Self-attention + self.qkv = torch.nn.Linear(hidden_dim, 3 * hidden_dim, bias=False).to(dtype) + self.proj = torch.nn.Linear(hidden_dim, hidden_dim, bias=False).to(dtype) + + # MLP + self.mlp_ratio = mlp_ratio + self.mlp_hidden_dim = int(hidden_dim * mlp_ratio) + self.mlp_fc1 = torch.nn.Linear(hidden_dim, self.mlp_hidden_dim, bias=False).to( + dtype + ) + self.mlp_fc2 = torch.nn.Linear(self.mlp_hidden_dim, hidden_dim, bias=False).to( + dtype + ) + + # Layer norms + self.norm1 = RMSNorm(hidden_dim, dtype=dtype) + self.norm2 = RMSNorm(hidden_dim, dtype=dtype) + + # Activation + self.activation = torch.nn.GELU() + + def forward(self, x): + batch_size, seq_len, _ = x.shape + + # Self-attention + residual = x + x = self.norm1(x) + + # Reshape qkv projection for better memory layout + qkv = self.qkv(x) # [batch_size, seq_len, 3 * hidden_dim] + qkv = qkv.reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim) + qkv = qkv.permute( + 2, 0, 3, 1, 4 + ) # [3, batch_size, 
num_heads, seq_len, head_dim] + q, k, v = qkv # Each has shape [batch_size, num_heads, seq_len, head_dim] + + # Scaled dot-product attention with proper reshaping + # Reshape for better memory layout and avoid broadcasting issues + q = q.reshape(batch_size * self.num_heads, seq_len, self.head_dim) + k = k.reshape(batch_size * self.num_heads, seq_len, self.head_dim) + v = v.reshape(batch_size * self.num_heads, seq_len, self.head_dim) + + # Compute attention scores + attn = (q @ k.transpose(-2, -1)) * (1.0 / (self.head_dim**0.5)) + attn = torch.softmax(attn, dim=-1) + + # Apply attention to values + x = attn @ v # [batch_size * num_heads, seq_len, head_dim] + + # Reshape back to original dimensions + x = x.reshape(batch_size, self.num_heads, seq_len, self.head_dim) + x = x.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_dim) + + # Project back to hidden dimension + x = self.proj(x) + x = residual + x + + # MLP + residual = x + x = self.norm2(x) + x = self.mlp_fc1(x) + x = self.activation(x) + x = self.mlp_fc2(x) + x = residual + x + + return x + + +def create_model_and_input_data( + model_type: str, + m: int, + k: int, + n: int, + high_precision_dtype: torch.dtype = torch.bfloat16, + device: str = "cuda", + activation: str = "relu", +): + """Create a model and input data for benchmarking. + + Args: + model_type (str): type of the model to be created + batch_size (int): batch size of the input data + device (str): device to run the model on + high_precision_dtype (torch.dtype): data type of the model + m, k, n (int): dimensions of the model and input data + """ + if model_type == "linear": + model = ToyLinearModel(k, n, high_precision_dtype).to(device) + input_data = torch.randn(m, k, device=device, dtype=high_precision_dtype) + elif "ln_linear" in model_type: + # Extract activation type from model_type string + match = re.search(r"ln_linear_?(\w+)?", model_type) + activation = match.group(1) if match and match.group(1) else "relu" + model = LNLinearActivationModel( + k, n, high_precision_dtype, activation=activation + ).to(device) + input_data = torch.randn(m, k, device=device, dtype=high_precision_dtype) + elif model_type == "transformer_block": + # For transformer block, k is the hidden dimension + model = TransformerBlock( + k, num_heads=8, mlp_ratio=4, dtype=high_precision_dtype + ).to(device) + # Input shape for transformer is [batch_size, seq_len, hidden_dim] + input_data = torch.randn(m, 16, k, device=device, dtype=high_precision_dtype) + else: + raise ValueError(f"Unknown model type: {model_type}") + return model, input_data From 33fa3ca3d7efaedb5e96792de6e925e8ec756ba1 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 10 Apr 2025 11:36:56 -0700 Subject: [PATCH 04/10] Add ruff fixes --- benchmarks/microbenchmarks/benchmark_inference.py | 9 +++------ benchmarks/microbenchmarks/benchmark_runner.py | 3 --- benchmarks/microbenchmarks/utils.py | 2 +- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index da01053202..ef54470d16 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -97,13 +97,10 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: result.profiler_json_path, result.perfetto_url = generate_model_profile( m_copy, input_data, config.profiler_file_name ) - except Exception as e: - print(f"Error running profiler: {e}") + except Exception: + print(f"Error running profiler for 
{config.name}") return result except Exception as e: - print(f"Error in benchmark run: {e}") - import traceback - - print(traceback.format_exc()) + print(f"Error in benchmark run: {config.name} with error: {e}") return None diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 1a60ca6b16..e38fc93819 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -167,10 +167,7 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None if result is not None: # Only add successful results results.append(result) except Exception as e: - import traceback - print(f"Error running benchmark {config.name} with error: {e}") - print(traceback.format_exc()) continue # Add results to csv if there are any diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 1973b57304..2785e4d7cb 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -77,7 +77,7 @@ def generate_model_profile(model, input_data, profile_file_path): record_shapes=True, with_stack=True, profile_memory=True, - with_flops=True, # Excperiemntal; might be unreliable for some layers + with_flops=True, # Experimental; might be unreliable for some layers ) # Warm up From 5ee6b589e6c166a3635709add6e2961fe22d87c9 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 10 Apr 2025 11:38:17 -0700 Subject: [PATCH 05/10] Updates --- .../microbenchmarks/benchmark_inference.py | 6 ++-- .../microbenchmarks/test/benchmark_config.yml | 36 ++++--------------- benchmarks/microbenchmarks/utils.py | 3 +- 3 files changed, 11 insertions(+), 34 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index ef54470d16..390359997d 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -94,11 +94,11 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: if config.enable_profiler: print("Running profiler...") try: - result.profiler_json_path, result.perfetto_url = generate_model_profile( + result.profiler_json_path = generate_model_profile( m_copy, input_data, config.profiler_file_name ) - except Exception: - print(f"Error running profiler for {config.name}") + except Exception as e: + print(f"Error running profiler for {config.name} with error: {e}") return result except Exception as e: diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 227cb90948..5ea3f5d642 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -2,34 +2,23 @@ benchmark_mode: "inference" quantization_config_recipe_names: # Will run a baseline inference for model by default, without quantization for comparison - # - "int4wo-32" - # - "marlin" - "int8wo" + - "int8dq" + - "float8dq" + - "float8wo" # sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison # - "semi-sparse" # - "block" output_dir: "benchmarks/microbenchmarks/results" model_params: - # - name: "small_bf16_linear" - # matrix_shapes: - # - name: "custom" - # shapes: [ - # [1024, 1024, 1024], # [m, k, n] - # ] - # high_precision_dtype: "torch.bfloat16" - # use_torch_compile: true - # torch_compile_mode: "max-autotune" - # device: "cuda" - # model_type: "linear" - # enable_profiler: true # 
Enable profiling for this model - - - name: "large_bf16_ln_linear" + - name: "small_bf16_linear" matrix_shapes: - name: "custom" shapes: [ + [1024, 1024, 1024], # [m, k, n] [2048, 4096, 1024], - # [4096, 4096, 1024] + [4096, 4096, 1024] ] high_precision_dtype: "torch.bfloat16" use_torch_compile: true @@ -37,16 +26,3 @@ model_params: device: "cuda" model_type: "linear" enable_profiler: true # Enable profiling for this model - enable_memory_profile: true # Enable memory profiling for this model - - # - name: "cpu_fp32_linear" - # matrix_shapes: - # - name: "custom" - # shapes: [ - # [4096, 4096, 1024] - # ] - # high_precision_dtype: "torch.float32" - # use_torch_compile: false - # device: "cpu" - # model_type: "linear" - # enable_profiler: true # Enable profiling for this model diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 2785e4d7cb..44011d92f2 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -473,7 +473,7 @@ def print_results(results: List[BenchmarkResult]): result.config.name, result.config.quantization or "baseline", result.config.sparsity or "none", - f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})" + f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})", f"{result.model_inference_time_in_ms:.2f}", str(result.config.enable_profiler), ] @@ -485,6 +485,7 @@ def print_results(results: List[BenchmarkResult]): "Name", "Quantization", "Sparsity", + "Shape", "Inference Time (ms)", "Profiler Enabled", ] From 345a00c0834c6f51e83e3c790af5cbffd8a73ae4 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 10 Apr 2025 12:40:48 -0700 Subject: [PATCH 06/10] Updates --- torchao/testing/model_architectures.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py index cb528e55ae..b42e662c6f 100644 --- a/torchao/testing/model_architectures.py +++ b/torchao/testing/model_architectures.py @@ -8,7 +8,6 @@ import torch import torch.nn as nn -from torch.nn import RMSNorm class ToyLinearModel(torch.nn.Module): @@ -73,8 +72,8 @@ def __init__(self, hidden_dim, num_heads=8, mlp_ratio=4, dtype=torch.bfloat16): ) # Layer norms - self.norm1 = RMSNorm(hidden_dim, dtype=dtype) - self.norm2 = RMSNorm(hidden_dim, dtype=dtype) + self.norm1 = nn.RMSNorm(hidden_dim, dtype=dtype) + self.norm2 = nn.RMSNorm(hidden_dim, dtype=dtype) # Activation self.activation = torch.nn.GELU() From bbcba36540a92f10477c9a7b1a3c13470426fd6b Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 10 Apr 2025 15:43:28 -0700 Subject: [PATCH 07/10] Updates --- .../microbenchmarks/test/benchmark_config.yml | 4 +- test/test_model_architecture.py | 37 ++++++++++++++++--- torchao/testing/model_architectures.py | 24 +++++++++--- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 72c6417ab0..2fc0433c36 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -8,8 +8,8 @@ quantization_config_recipe_names: - "float8wo" # sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - - "semi-sparse" - - "block" + # - "semi-sparse" + # - "block" output_dir: "benchmarks/microbenchmarks/results" model_params: - name: "small_bf16_linear" diff --git 
a/test/test_model_architecture.py b/test/test_model_architecture.py index 433473ae5e..973939a56a 100644 --- a/test/test_model_architecture.py +++ b/test/test_model_architecture.py @@ -6,22 +6,47 @@ import unittest +import torch +from parameterized import parameterized + from torchao.testing.model_architectures import create_model_and_input_data +from torchao.utils import get_available_devices class TestModels(unittest.TestCase): - def test_toy_linear_model(self): - model, input_data = create_model_and_input_data("linear", 10, 64, 32) + @parameterized.expand([(device,) for device in get_available_devices()]) + def test_toy_linear_model(self, device): + # Skip if device is not available + if device == "cuda" and not torch.cuda.is_available(): + self.skipTest("CUDA not available") + + model, input_data = create_model_and_input_data( + "linear", 10, 64, 32, device=device + ) output = model(input_data) self.assertEqual(output.shape, (10, 32)) - def test_ln_linear_activation_model(self): - model, input_data = create_model_and_input_data("ln_linear_sigmoid", 10, 64, 32) + @parameterized.expand([(device,) for device in get_available_devices()]) + def test_ln_linear_activation_model(self, device): + # Skip if device is not available + if device == "cuda" and not torch.cuda.is_available(): + self.skipTest("CUDA not available") + + model, input_data = create_model_and_input_data( + "ln_linear_sigmoid", 10, 64, 32, device=device + ) output = model(input_data) self.assertEqual(output.shape, (10, 32)) - def test_transformer_block(self): - model, input_data = create_model_and_input_data("transformer_block", 10, 64, 32) + @parameterized.expand([(device,) for device in get_available_devices()]) + def test_transformer_block(self, device): + # Skip if device is not available + if device == "cuda" and not torch.cuda.is_available(): + self.skipTest("CUDA not available") + + model, input_data = create_model_and_input_data( + "transformer_block", 10, 64, 32, device=device + ) output = model(input_data) self.assertEqual(output.shape, (10, 16, 64)) diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py index b42e662c6f..fe087ea33f 100644 --- a/torchao/testing/model_architectures.py +++ b/torchao/testing/model_architectures.py @@ -21,9 +21,7 @@ def forward(self, x): class LNLinearActivationModel(nn.Module): - def __init__( - self, fc_dim1, fc_dim2, dtype=torch.bfloat16, activation="sigmoid", device=None - ): + def __init__(self, fc_dim1, fc_dim2, dtype=torch.bfloat16, activation="sigmoid"): super().__init__() activation = activation.lower() @@ -41,7 +39,7 @@ def __init__( raise ValueError(f"Unsupported activation: {activation}") self.ln = nn.LayerNorm(fc_dim1, elementwise_affine=False) - self.fc = nn.Linear(fc_dim1, fc_dim2, bias=False).to(dtype=dtype, device=device) + self.fc = nn.Linear(fc_dim1, fc_dim2, bias=False).to(dtype=dtype) self.activation = activation_map[activation] def forward(self, x): @@ -50,6 +48,20 @@ def forward(self, x): return self.activation(x) +class RMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-5): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = self._norm(x.float()).type_as(x) + return output * self.weight + + class TransformerBlock(torch.nn.Module): def __init__(self, hidden_dim, num_heads=8, mlp_ratio=4, dtype=torch.bfloat16): 
super().__init__() @@ -72,8 +84,8 @@ def __init__(self, hidden_dim, num_heads=8, mlp_ratio=4, dtype=torch.bfloat16): ) # Layer norms - self.norm1 = nn.RMSNorm(hidden_dim, dtype=dtype) - self.norm2 = nn.RMSNorm(hidden_dim, dtype=dtype) + self.norm1 = RMSNorm(hidden_dim).to(dtype) + self.norm2 = RMSNorm(hidden_dim).to(dtype) # Activation self.activation = torch.nn.GELU() From d5bdb4a4effa516f923896596c07ea41c64d7aac Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 14 Apr 2025 16:01:54 -0700 Subject: [PATCH 08/10] updates --- .../microbenchmarks/benchmark_inference.py | 4 +- benchmarks/microbenchmarks/profiler.py | 60 +++++++++++++++++++ .../test/test_benchmark_profiler.py | 4 +- benchmarks/microbenchmarks/utils.py | 52 ---------------- 4 files changed, 66 insertions(+), 54 deletions(-) create mode 100644 benchmarks/microbenchmarks/profiler.py diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 390359997d..3af0ceb57b 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -15,12 +15,14 @@ import torch +from benchmarks.microbenchmarks.profiler import ( + generate_model_profile, +) from benchmarks.microbenchmarks.utils import ( BenchmarkConfig, BenchmarkResult, clean_caches, create_model_and_input, - generate_model_profile, model_inference_time_in_ms, string_to_config, ) diff --git a/benchmarks/microbenchmarks/profiler.py b/benchmarks/microbenchmarks/profiler.py new file mode 100644 index 0000000000..bd753e0857 --- /dev/null +++ b/benchmarks/microbenchmarks/profiler.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +import os + +import torch +from torch.profiler import ProfilerActivity + + +def generate_model_profile(model, input_data, profile_file_path): + """Function to benchmark model evaluation with profiling. 
+ + Args: + model: The model to profile + input_data: Input data for the model + profile_file_path: Path to save the profiler output + + Returns: + profile_file_path + """ + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(profile_file_path), exist_ok=True) + + # Set up profiler activities based on device + activities = [ProfilerActivity.CPU] + device = next(model.parameters()).device + if device.type == "cuda" and torch.cuda.is_available(): + activities.append(ProfilerActivity.CUDA) + + # Run profiler with minimal settings to ensure compatibility + prof = torch.profiler.profile( + activities=activities, + record_shapes=True, + with_stack=True, + profile_memory=True, + with_flops=True, # Experimental; might be unreliable for some layers + ) + + # Warm up + with torch.no_grad(): + for _ in range(3): + _ = model(input_data) + if device.type == "cuda": + torch.cuda.synchronize() + + # Profile + with prof: + with torch.no_grad(): + for _ in range(3): + _ = model(input_data) + if device.type == "cuda": + torch.cuda.synchronize() + + # Save profiling details + prof.export_chrome_trace(profile_file_path) + print(f"Profile saved to: {profile_file_path}") + + return profile_file_path diff --git a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py index 2322b1b1c5..0e398b4899 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_profiler.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_profiler.py @@ -10,10 +10,12 @@ import torch +from benchmarks.microbenchmarks.profiler import ( + generate_model_profile, +) from benchmarks.microbenchmarks.utils import ( BenchmarkConfig, ToyLinearModel, - generate_model_profile, ) diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 44011d92f2..df543bb4eb 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -9,7 +9,6 @@ import torch from tabulate import tabulate -from torch.profiler import ProfilerActivity from torch.utils.benchmark import Timer from torchao.core.config import AOBaseConfig @@ -51,57 +50,6 @@ def get_default_device(device: str = "cuda") -> str: return "cpu" -def generate_model_profile(model, input_data, profile_file_path): - """Function to benchmark model evaluation with profiling. 
- - Args: - model: The model to profile - input_data: Input data for the model - profile_file_path: Path to save the profiler output - - Returns: - Tuple of (profile_file_path, perfetto_url) - """ - # Create parent directory if it doesn't exist - os.makedirs(os.path.dirname(profile_file_path), exist_ok=True) - - # Set up profiler activities based on device - activities = [ProfilerActivity.CPU] - device = next(model.parameters()).device - if device.type == "cuda" and torch.cuda.is_available(): - activities.append(ProfilerActivity.CUDA) - - # Run profiler with minimal settings to ensure compatibility - prof = torch.profiler.profile( - activities=activities, - record_shapes=True, - with_stack=True, - profile_memory=True, - with_flops=True, # Experimental; might be unreliable for some layers - ) - - # Warm up - with torch.no_grad(): - for _ in range(3): - _ = model(input_data) - if device.type == "cuda": - torch.cuda.synchronize() - - # Profile - with prof: - with torch.no_grad(): - for _ in range(3): - _ = model(input_data) - if device.type == "cuda": - torch.cuda.synchronize() - - # Save profiling details - prof.export_chrome_trace(profile_file_path) - print(f"Profile saved to: {profile_file_path}") - - return profile_file_path - - class BenchmarkConfig: def __init__( self, From 784ec94a052d906079ee7842a5c6c299daab4a03 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Fri, 18 Apr 2025 11:30:25 -0700 Subject: [PATCH 09/10] Added a future todo --- torchao/testing/model_architectures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py index fe087ea33f..f59a1271b1 100644 --- a/torchao/testing/model_architectures.py +++ b/torchao/testing/model_architectures.py @@ -10,6 +10,7 @@ import torch.nn as nn +# TODO: Refactor torchao and tests to use these models class ToyLinearModel(torch.nn.Module): def __init__(self, k=64, n=32, dtype=torch.bfloat16): super().__init__() From 8f73ebfbb5a9110a223be3b04e8a46097624e1fe Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 22 Apr 2025 18:09:56 -0700 Subject: [PATCH 10/10] Lint fixes --- benchmarks/microbenchmarks/benchmark_runner.py | 4 ++-- torchao/quantization/qat/embedding.py | 12 ++++++------ torchao/quantization/qat/linear.py | 11 +++++------ 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 0c137121ac..fbd7f08388 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -74,8 +74,8 @@ def get_shapes_for_config( for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)): val1 = 2**power_of_2 val2 = 2**power_of_2 + 2 ** (power_of_2 - 1) - shapes.append((f"{name}_{idx*2}", [val1, val1, val1])) - shapes.append((f"{name}_{idx*2+1}", [val2, val2, val2])) + shapes.append((f"{name}_{idx * 2}", [val1, val1, val1])) + shapes.append((f"{name}_{idx * 2 + 1}", [val2, val2, val2])) elif name == "sweep": # Generate a sweep of shapes with different powers of 2 for M, K, N min_p2 = shape_config.get("min_power", 8) # 256 diff --git a/torchao/quantization/qat/embedding.py b/torchao/quantization/qat/embedding.py index 02772f05f0..97b5920c7a 100644 --- a/torchao/quantization/qat/embedding.py +++ b/torchao/quantization/qat/embedding.py @@ -196,7 +196,7 @@ def convert( """ self._convert_helper(model) return model - + @staticmethod def quantize_weights( weight: torch.Tensor, @@ -207,12 +207,11 @@ def 
quantize_weights( Helper function to quantize weights """ (qmin, qmax) = _get_qmin_qmax(bit_width) - (s, zp) = get_group_qparams_symmetric( - weight, bit_width, group_size - ) + (s, zp) = get_group_qparams_symmetric(weight, bit_width, group_size) from torchao._executorch_ops import ( _quantized_decomposed_quantize_per_channel_group_wrapper, ) + q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper( weight, s, @@ -224,7 +223,6 @@ def quantize_weights( ) return (q_weight, s, zp) - def _convert_helper(self, module: torch.nn.Module): """ Helper function to recursively swap `Int4WeightOnlyQATEmbedding` @@ -255,7 +253,9 @@ def _convert_helper(self, module: torch.nn.Module): ) setattr(module, name, quantized_embedding) - q_weight, s, zp = self.quantize_weights(child.weight, self.bit_width, group_size) + q_weight, s, zp = self.quantize_weights( + child.weight, self.bit_width, group_size + ) # Load weights and qparams into quantized embedding quantized_embedding.weight = q_weight quantized_embedding.scale = s.to(scale_precision) diff --git a/torchao/quantization/qat/linear.py b/torchao/quantization/qat/linear.py index ab5417fb16..d384eff2d6 100644 --- a/torchao/quantization/qat/linear.py +++ b/torchao/quantization/qat/linear.py @@ -197,7 +197,7 @@ def convert( ) -> torch.nn.Module: self._convert_qat_linear_8da4w(model) return model - + @staticmethod def quantize_weights( weight: torch.Tensor, @@ -209,9 +209,7 @@ def quantize_weights( # Load weights and qparams into quantized linear n_bit = 4 (qmin, qmax) = _get_qmin_qmax(n_bit) - (s, zp) = get_group_qparams_symmetric( - weight, n_bit, group_size - ) + (s, zp) = get_group_qparams_symmetric(weight, n_bit, group_size) from torchao._executorch_ops import ( _quantized_decomposed_quantize_per_channel_group_wrapper, ) @@ -227,7 +225,6 @@ def quantize_weights( ) return (q_weight, s, zp) - def _convert_qat_linear_8da4w(self, module: torch.nn.Module): """ Replace all `Int8DynActInt4WeightQATLinear` with `Int8DynActInt4WeightLinear`. @@ -245,7 +242,9 @@ def _convert_qat_linear_8da4w(self, module: torch.nn.Module): ) setattr(module, name, quantized_linear) - q_weight, scales, zeros = self.quantize_weights(child.weight, config.group_size) + q_weight, scales, zeros = self.quantize_weights( + child.weight, config.group_size + ) quantized_linear.weight = q_weight quantized_linear.scales = scales quantized_linear.zeros = zeros
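
Reviewer note (not part of the patches above): a minimal usage sketch of the new benchmarks/microbenchmarks/profiler.py helper added in PATCH 08/10, assuming a CPU-only run; the toy model, shapes, and output path below are illustrative assumptions, not values taken from the benchmark configs.

# Illustrative only: exercises generate_model_profile() from PATCH 08/10.
# The model and the output path are hypothetical examples for this note.
import torch

from benchmarks.microbenchmarks.profiler import generate_model_profile

model = torch.nn.Sequential(torch.nn.Linear(64, 32, bias=False)).eval()
input_data = torch.randn(10, 64)

# Runs warm-up and profiled forward passes, writes a Chrome trace
# (viewable in chrome://tracing or Perfetto), and returns its path.
trace_path = generate_model_profile(
    model,
    input_data,
    "benchmarks/microbenchmarks/results/example_profile.json",
)
print(trace_path)

The helper picks ProfilerActivity.CUDA automatically when the model's parameters live on a CUDA device, so the same call works unchanged for the CUDA configs in benchmark_config.yml.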