From 2c622869a90d43caadf8e754ae24db9c9963158e Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Tue, 1 Apr 2025 13:24:43 -0700 Subject: [PATCH] Memory profiler for cuda --- .../microbenchmarks/benchmark_inference.py | 154 ++++--- .../microbenchmarks/benchmark_runner.py | 22 +- .../microbenchmarks/test/benchmark_config.yml | 61 +-- benchmarks/microbenchmarks/utils.py | 385 +++++++++++++++--- 4 files changed, 464 insertions(+), 158 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index c084d18d3a..15d62d1386 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -20,6 +20,8 @@ BenchmarkResult, clean_caches, create_model_and_input, + generate_memory_profile, + generate_model_profile, model_inference_time_in_ms, string_to_config, ) @@ -29,70 +31,92 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: """Run inference benchmarks""" - clean_caches() # Clean caches - - # Create output directory if it doesn't exist - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - base_model, input_data = create_model_and_input( - config.model_type, - config.m, - config.k, - config.n, - high_precision_dtype=config.high_precision_dtype, - device=config.device, - ) - - # Use quantize_ to apply each quantization function to the model - m_copy = deepcopy(base_model).eval().to(config.device) - ao_base_config = string_to_config( - config.quantization, - config.sparsity, - high_precision_dtype=config.high_precision_dtype, - ) - - # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) - is_cuda = config.device == "cuda" and torch.cuda.is_available() - - if config.sparsity is not None and ( - config.quantization is None or "baseline" in config.quantization - ): - if is_cuda: - print(f"Applying {config.sparsity} sparsity to model") - sparsify_(m_copy, ao_base_config) + try: + clean_caches() # Clean caches + + # Create output directory if it doesn't exist + Path(config.output_dir).mkdir(parents=True, exist_ok=True) + + base_model, input_data = create_model_and_input( + config.model_type, + config.m, + config.k, + config.n, + high_precision_dtype=config.high_precision_dtype, + device=config.device, + ) + + # Use quantize_ to apply each quantization function to the model + m_copy = deepcopy(base_model).eval().to(config.device) + ao_base_config = string_to_config( + config.quantization, + config.sparsity, + high_precision_dtype=config.high_precision_dtype, + ) + + # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) + is_cuda = config.device == "cuda" and torch.cuda.is_available() + + if config.sparsity is not None and ( + config.quantization is None or "baseline" in config.quantization + ): + if is_cuda: + print(f"Applying {config.sparsity} sparsity to model") + sparsify_(m_copy, ao_base_config) + else: + print( + f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + ) + elif config.sparsity is None and ( + config.quantization is None or "baseline" in config.quantization + ): + pass # No quantization or sparsity specified, do nothing else: - print( - f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + print("Quantizing model....") + quantize_(m_copy, ao_base_config) + + if config.use_torch_compile: + print("Compiling model....") + m_copy = torch.compile( + m_copy, 
mode=config.torch_compile_mode, fullgraph=True ) - elif config.sparsity is None and ( - config.quantization is None or "baseline" in config.quantization - ): - pass # No quantization or sparsity specified, do nothing - else: - print("Quantizing model....") - quantize_(m_copy, ao_base_config) - - if config.use_torch_compile: - print("Compiling model....") - m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) - - # Run benchmarks - result = BenchmarkResult(config=config) - - # Benchmark time to run an inference call for quantized model - result.model_inference_time_in_ms = model_inference_time_in_ms( - model=m_copy, input_data=input_data - ) - - # TODO: Benchmark time using profiler - # Profile dtype model evaluation - # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype) - # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details - - # TODO: Benchmark gemm time using cuda graph - # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs) - - # TODO: Benchmark op with cuda graph - # time = benchmark_op_with_cuda_graph(op, args) - - return result + + # Run benchmarks + result = BenchmarkResult(config=config) + # Store result in model for memory profiling + m_copy._benchmark_result = result + + # Benchmark time to run an inference call for quantized model + result.model_inference_time_in_ms = model_inference_time_in_ms( + model=m_copy, input_data=input_data + ) + + # Run profiler if enabled + if config.enable_profiler: + print("Running profiler...") + try: + result.profiler_json_path, result.perfetto_url = generate_model_profile( + m_copy, input_data, config.profiler_file_name + ) + except Exception as e: + print(f"Error running profiler: {e}") + + # Run memory profiler if enabled + if config.enable_memory_profile: + print("Running memory profiler...") + try: + result.memory_profile_path, result.memory_stats = ( + generate_memory_profile( + m_copy, input_data, config.memory_profile_file_name + ) + ) + except Exception as e: + print(f"Error running memory profiler: {e}") + + return result + except Exception as e: + print(f"Error in benchmark run: {e}") + import traceback + + print(traceback.format_exc()) + return None diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 7152542eec..1a60ca6b16 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -164,16 +164,22 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}" ) result = run_inference(config) # Pass the config object directly - results.append(result) - except Exception: - print(f"Error running benchmark {config.name}") - continue + if result is not None: # Only add successful results + results.append(result) + except Exception as e: + import traceback - # Add results to csv - generate_results_csv(results, configs[0].output_dir) + print(f"Error running benchmark {config.name} with error: {e}") + print(traceback.format_exc()) + continue - # Print results - print_results(results) + # Add results to csv if there are any + if results: + generate_results_csv(results, configs[0].output_dir) + # Print results + print_results(results) + else: + print("No benchmark results were collected. All benchmarks failed.") # TODO: Process results: Speedups: # 1. 
For different shapes for same model and quantization diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 97a38469de..227cb90948 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -2,46 +2,51 @@ benchmark_mode: "inference" quantization_config_recipe_names: # Will run a baseline inference for model by default, without quantization for comparison - - "int4wo-32" - - "marlin" -sparsity_config_recipe_names: + # - "int4wo-32" + # - "marlin" + - "int8wo" +# sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - - "semi-sparse" - - "block" + # - "semi-sparse" + # - "block" output_dir: "benchmarks/microbenchmarks/results" model_params: - - name: "small_bf16_linear" - matrix_shapes: - - name: "custom" - shapes: [ - [1024, 1024, 1024], # [m, k, n] - ] - high_precision_dtype: "torch.bfloat16" - use_torch_compile: true - torch_compile_mode: "max-autotune" - device: "cuda" - model_type: "linear" + # - name: "small_bf16_linear" + # matrix_shapes: + # - name: "custom" + # shapes: [ + # [1024, 1024, 1024], # [m, k, n] + # ] + # high_precision_dtype: "torch.bfloat16" + # use_torch_compile: true + # torch_compile_mode: "max-autotune" + # device: "cuda" + # model_type: "linear" + # enable_profiler: true # Enable profiling for this model - name: "large_bf16_ln_linear" matrix_shapes: - name: "custom" shapes: [ [2048, 4096, 1024], - [4096, 4096, 1024] + # [4096, 4096, 1024] ] high_precision_dtype: "torch.bfloat16" use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" - model_type: "ln_linear_sigmoid" - - - name: "cpu_fp32_linear" - matrix_shapes: - - name: "custom" - shapes: [ - [4096, 4096, 1024] - ] - high_precision_dtype: "torch.float32" - use_torch_compile: false - device: "cpu" model_type: "linear" + enable_profiler: true # Enable profiling for this model + enable_memory_profile: true # Enable memory profiling for this model + + # - name: "cpu_fp32_linear" + # matrix_shapes: + # - name: "custom" + # shapes: [ + # [4096, 4096, 1024] + # ] + # high_precision_dtype: "torch.float32" + # use_torch_compile: false + # device: "cpu" + # model_type: "linear" + # enable_profiler: true # Enable profiling for this model diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index fd3db11591..624d880a03 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -4,11 +4,15 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
import csv +import json import os +import subprocess +import uuid from typing import Any, Dict, List, Optional import torch from tabulate import tabulate +from torch.profiler import ProfilerActivity from torch.utils.benchmark import Timer from torchao.core.config import AOBaseConfig @@ -50,6 +54,262 @@ def get_default_device(device: str = "cuda") -> str: return "cpu" +def upload_trace_file(local_path: str, overwrite: bool = False) -> Optional[str]: + MANIFOLD_FOLDER = "perfetto_internal_traces/tree/shared_trace" + DEFAULT_TTL_SEC = 28 * 24 * 60 * 60 + file_name = os.path.basename(local_path) + manifold_path = os.path.join( + MANIFOLD_FOLDER, f"{os.getlogin()}_{str(uuid.uuid4())}_{file_name}" + ) + cmd = [ + "manifold", + "put", + local_path, + manifold_path, + "--ttl", + str(DEFAULT_TTL_SEC), + "--userData", + "false", + ] + ret = subprocess.run( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True + ) + if ret.returncode == 0: + print("Upload trace successfully.") + return manifold_path + else: + print("[ERROR] Upload failed, maybe the trace file exists.") + return None + + +def print_perfetto_ui_url(manifold_path: str) -> Optional[str]: + """Generate and print the Perfetto UI URL for a Manifold trace file. + + Args: + manifold_path: Path to the trace file in Manifold + + Returns: + The URL to the Perfetto UI or None if there was an error + """ + try: + url = ( + "https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html#!/?url=https://interncache-all.fbcdn.net/manifold/" + + manifold_path + ) + print(f"The trace is accessible at:\n{url}") + return url + except Exception as e: + print(f"Error generating Perfetto UI URL: {e}") + return None + + +def generate_model_profile(model, input_data, profile_file_path): + """Function to benchmark model evaluation with profiling. 
+ + Args: + model: The model to profile + input_data: Input data for the model + profile_file_path: Path to save the profiler output + + Returns: + Tuple of (profile_file_path, perfetto_url) + """ + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(profile_file_path), exist_ok=True) + + # Set up profiler activities based on device + activities = [ProfilerActivity.CPU] + device = next(model.parameters()).device + if device.type == "cuda" and torch.cuda.is_available(): + activities.append(ProfilerActivity.CUDA) + + # Run profiler with minimal settings to ensure compatibility + prof = torch.profiler.profile( + activities=activities, + record_shapes=True, + with_stack=False, # Disable stack traces to reduce overhead + profile_memory=False, # Disable memory profiling as it's not reliable across all devices + ) + + # Warm up + with torch.no_grad(): + for _ in range(3): + _ = model(input_data) + if device.type == "cuda": + torch.cuda.synchronize() + + # Profile + with prof: + with torch.no_grad(): + _ = model(input_data) + if device.type == "cuda": + torch.cuda.synchronize() + + # Save profiling details + prof.export_chrome_trace(profile_file_path) + print(f"Profile saved to: {profile_file_path}") + + # Try to upload to Perfetto UI + perfetto_url = None + try: + manifold_path = upload_trace_file(profile_file_path) + if manifold_path: + perfetto_url = print_perfetto_ui_url(manifold_path) + except Exception as e: + print(f"Warning: Failed to upload profile to Perfetto UI: {e}") + + return profile_file_path, perfetto_url + + +# def visualize_memory_profile(pickle_path: str, output_html_path: Optional[str] = None) -> Optional[str]: +# """Visualize memory profile from pickle file using PyTorch's memory visualization tools. + +# Args: +# pickle_path: Path to the pickle file containing memory snapshot +# output_html_path: Optional path to save the HTML visualization. If None, will use pickle_path with .html extension + +# Returns: +# Path to the generated HTML file if successful, None otherwise +# """ +# try: +# import subprocess +# import sys + +# if output_html_path is None: +# output_html_path = pickle_path.replace('.pickle', '.html') + +# # Get the path to PyTorch's memory visualization script +# pytorch_dir = os.path.dirname(os.path.dirname(torch.__file__)) +# memory_viz_script = os.path.join(pytorch_dir, "torch", "cuda", "_memory_viz.py") + +# if not os.path.exists(memory_viz_script): +# print(f"Warning: Memory visualization script not found at {memory_viz_script}") +# return None + +# # Run the visualization script +# cmd = [ +# sys.executable, +# memory_viz_script, +# "trace_plot", +# pickle_path, +# "-o", +# output_html_path +# ] + +# result = subprocess.run( +# cmd, +# stdout=subprocess.PIPE, +# stderr=subprocess.PIPE, +# universal_newlines=True +# ) + +# if result.returncode == 0: +# print(f"Memory visualization saved to: {output_html_path}") +# return output_html_path +# else: +# print(f"Warning: Failed to generate memory visualization: {result.stderr}") +# return None + +# except Exception as e: +# print(f"Warning: Failed to generate memory visualization: {e}") +# return None + + +def generate_memory_profile(model, input_data, profile_file_path): + """Function to generate memory profile for model evaluation. 
+ + Args: + model: The model to profile + input_data: Input data for the model + profile_file_path: Path to save the memory profile output + + Returns: + Tuple of (profile_file_path, memory_stats) + """ + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(profile_file_path), exist_ok=True) + + device = next(model.parameters()).device + memory_stats = { + "peak_memory_allocated": 0, + "peak_memory_reserved": 0, + "total_memory_allocated": 0, + "total_memory_reserved": 0, + "memory_events": [], + } + + if device.type == "cuda": + # Enable memory history recording for CUDA + torch.cuda.memory._record_memory_history( + True, trace_alloc_max_entries=250000, trace_alloc_record_context=True + ) + + # Reset CUDA memory stats + torch.cuda.reset_peak_memory_stats() + torch.cuda.empty_cache() + + # Warm up + with torch.no_grad(): + for _ in range(3): + _ = model(input_data) + torch.cuda.synchronize() + + # Profile memory + with torch.no_grad(): + _ = model(input_data) + torch.cuda.synchronize() + + # Collect memory stats + memory_stats.update( + { + "peak_memory_allocated": torch.cuda.max_memory_allocated() + / 1024**2, # Convert to MB + "peak_memory_reserved": torch.cuda.max_memory_reserved() / 1024**2, + "total_memory_allocated": torch.cuda.memory_allocated() / 1024**2, + "total_memory_reserved": torch.cuda.memory_reserved() / 1024**2, + } + ) + + # Get detailed memory snapshot + snapshot = torch.cuda.memory._snapshot() + + # Save memory profile as pickle file + pickle_path = profile_file_path.replace(".json", ".pickle") + with open(pickle_path, "wb") as f: + from pickle import dump + + dump(snapshot, f) + + print(f"Memory profile saved to: {pickle_path}") + print( + f"\nmemory profile {pickle_path} saved, to convert that to a usable file, use", + "python pytorch/torch/cuda/_memory_viz.py trace_plot -o .html", + ) + + # TODO: Generate HTML visualization + # html_path = visualize_memory_profile(pickle_path) + # if html_path: + # # Set the visualization path in the result + # if isinstance(model, torch.nn.Module): + # result = getattr(model, '_benchmark_result', None) + # if result is not None: + # result.memory_visualization_path = html_path + + # Disable memory history recording + torch.cuda.memory._record_memory_history(False) + + else: + print("Memory profiling only works on CUDA devices") + # TODO: Add XPU support when available + return profile_file_path, memory_stats + + # Save basic stats as JSON for easy access + with open(profile_file_path, "w") as f: + json.dump(memory_stats, f, indent=2) + + return profile_file_path, memory_stats + + class BenchmarkConfig: def __init__( self, @@ -84,6 +344,17 @@ def __init__( "name", f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", ) + self.enable_profiler = bool(params.get("enable_profiler", False)) + self.enable_memory_profile = bool(params.get("enable_memory_profile", False)) + # Create profiler directory path without leading slash + profiler_dir = os.path.join(self.output_dir, "profiler") + os.makedirs(profiler_dir, exist_ok=True) + self.profiler_file_name = os.path.join( + profiler_dir, f"{self.name}_{self.m}_{self.k}_{self.n}_profile.json" + ) + self.memory_profile_file_name = os.path.join( + profiler_dir, f"{self.name}_{self.m}_{self.k}_{self.n}_memory_profile.json" + ) @staticmethod def _parse_precision(precision_str: str) -> torch.dtype: @@ -105,6 +376,8 @@ def to_dict(self) -> Dict[str, Any]: "device": self.device, "model_type": 
self.model_type, "output_dir": self.output_dir, + "enable_profiler": self.enable_profiler, + "enable_memory_profile": self.enable_memory_profile, } @@ -116,13 +389,24 @@ def __init__( self.config = config self.output_dir = config.output_dir self.model_inference_time_in_ms = 0.0 + self.profiler_json_path: Optional[str] = None + self.perfetto_url: Optional[str] = None + self.memory_profile_path: Optional[str] = None + self.memory_stats: Optional[Dict[str, Any]] = None + # self.memory_visualization_path: Optional[str] = None def to_dict(self) -> Dict[str, Any]: """Convert result to dictionary for main function""" - return { + result_dict = { **self.config.to_dict(), "model_inference_time_in_ms": self.model_inference_time_in_ms, + "profiler_json_path": self.profiler_json_path, + "perfetto_url": self.perfetto_url, + "memory_profile_path": self.memory_profile_path, + "memory_stats": self.memory_stats, + # "memory_visualization_path": self.memory_visualization_path, } + return result_dict class ToyLinearModel(torch.nn.Module): @@ -373,6 +657,11 @@ def generate_results_csv( output_dir (str): Directory to save the CSV file. file_name (str, optional): Name of the CSV file. Defaults to "results.csv". """ + # Check if results list is empty + if not results: + print("No results to save to CSV.") + return + # Create the output directory if it doesn't exist os.makedirs(output_dir, exist_ok=True) file_path = os.path.join(output_dir, file_name) @@ -390,68 +679,50 @@ def generate_results_csv( def print_results(results: List[BenchmarkResult]): - """Print benchmark results in a formatted table. - - Args: - results (List[BenchmarkResult]): List of benchmark results - """ + """Print results in a table format""" if not results: print("No results to display") return - # Extract relevant columns for display - display_columns = [ - "quantization", - "sparsity", - "model_type", - "m", - "k", - "n", - "model_inference_time_in_ms", - "use_torch_compile", - ] - - # Format data for tabulate - headers = { - "quantization": "Quantization", - "sparsity": "Sparsity", - "model_type": "Model Type", - "m": "M", - "k": "K", - "n": "N", - "model_inference_time_in_ms": "Time (μs)", - "use_torch_compile": "Compile Mode", - } - - # Extract and format data table_data = [] for result in results: - result_dict = result.to_dict() - row = [] - for col in display_columns: - value = result_dict.get(col, "N/A") - if value is None: - value = "N/A" - if col == "model_inference_time_in_ms": - value = f"{value:.2f}" if isinstance(value, (int, float)) else value - elif col == "use_torch_compile": - # Show compile mode if compile is True, otherwise show False - value = ( - result_dict.get("torch_compile_mode", "default") - if result_dict.get("use_torch_compile") - else "False" + if result is None: + continue + + row = [ + result.config.name, + result.config.quantization or "baseline", + result.config.sparsity or "none", + f"{result.model_inference_time_in_ms:.2f}", + str(result.config.enable_profiler), + str(result.config.enable_memory_profile), + ] + + # Add memory profile data if enabled + if result.config.enable_memory_profile: + if result.memory_stats: + row.append( + f"Peak memory: {result.memory_stats['peak_memory_allocated']:.2f}MB" ) - row.append(value) + else: + row.append("Memory profiling failed") + table_data.append(row) - # Print formatted table - print("\nBenchmark Results:") - print( - tabulate( - table_data, - headers=[headers[col] for col in display_columns], - tablefmt="grid", - floatfmt=".2f", - ) - ) - print() + # Define 
headers + headers = [ + "Name", + "Quantization", + "Sparsity", + "Inference Time (ms)", + "Profiler Enabled", + "Memory Profiling Enabled", + ] + if any(r.config.enable_memory_profile for r in results if r is not None): + headers.append("Memory Profile Data") + + if table_data: + print("\nBenchmark Results:") + print(tabulate(table_data, headers=headers, tablefmt="grid")) + else: + print("\nNo valid results to display")
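
Usage sketch (editor's note, not part of the patch): the snippet below shows how the new generate_memory_profile() helper added in benchmarks/microbenchmarks/utils.py could be exercised on its own, mirroring what run() does when enable_memory_profile is set in the YAML config. The import path and the toy nn.Linear model are assumptions for illustration only; in the benchmark itself the model comes from create_model_and_input() and the output path from BenchmarkConfig.memory_profile_file_name. Requires a CUDA device, since the profiler falls back to a warning on CPU.

    import torch
    from benchmarks.microbenchmarks.utils import generate_memory_profile  # assumed module path

    # Stand-in for the benchmark's model_type="linear" with m=k=n=1024.
    model = (
        torch.nn.Linear(1024, 1024, bias=False)
        .to(device="cuda", dtype=torch.bfloat16)
        .eval()
    )
    input_data = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)

    # Returns (json_path, stats_dict); also writes a .pickle CUDA memory snapshot next to
    # the JSON that can be rendered with torch/cuda/_memory_viz.py trace_plot.
    json_path, memory_stats = generate_memory_profile(
        model,
        input_data,
        "benchmarks/microbenchmarks/results/profiler/demo_memory_profile.json",
    )
    print(f"Peak allocated: {memory_stats['peak_memory_allocated']:.2f} MB")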