From 2c622869a90d43caadf8e754ae24db9c9963158e Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Tue, 1 Apr 2025 13:24:43 -0700 Subject: [PATCH] Memory profiler for cuda --- .../microbenchmarks/benchmark_inference.py | 154 ++++--- .../microbenchmarks/benchmark_runner.py | 22 +- .../microbenchmarks/test/benchmark_config.yml | 61 +-- benchmarks/microbenchmarks/utils.py | 385 +++++++++++++++--- 4 files changed, 464 insertions(+), 158 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index c084d18d3a..15d62d1386 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -20,6 +20,8 @@ BenchmarkResult, clean_caches, create_model_and_input, + generate_memory_profile, + generate_model_profile, model_inference_time_in_ms, string_to_config, ) @@ -29,70 +31,92 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: """Run inference benchmarks""" - clean_caches() # Clean caches - - # Create output directory if it doesn't exist - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - base_model, input_data = create_model_and_input( - config.model_type, - config.m, - config.k, - config.n, - high_precision_dtype=config.high_precision_dtype, - device=config.device, - ) - - # Use quantize_ to apply each quantization function to the model - m_copy = deepcopy(base_model).eval().to(config.device) - ao_base_config = string_to_config( - config.quantization, - config.sparsity, - high_precision_dtype=config.high_precision_dtype, - ) - - # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) - is_cuda = config.device == "cuda" and torch.cuda.is_available() - - if config.sparsity is not None and ( - config.quantization is None or "baseline" in config.quantization - ): - if is_cuda: - print(f"Applying {config.sparsity} sparsity to model") - sparsify_(m_copy, ao_base_config) + try: + clean_caches() # Clean caches + + # Create output directory if it doesn't exist + Path(config.output_dir).mkdir(parents=True, exist_ok=True) + + base_model, input_data = create_model_and_input( + config.model_type, + config.m, + config.k, + config.n, + high_precision_dtype=config.high_precision_dtype, + device=config.device, + ) + + # Use quantize_ to apply each quantization function to the model + m_copy = deepcopy(base_model).eval().to(config.device) + ao_base_config = string_to_config( + config.quantization, + config.sparsity, + high_precision_dtype=config.high_precision_dtype, + ) + + # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) + is_cuda = config.device == "cuda" and torch.cuda.is_available() + + if config.sparsity is not None and ( + config.quantization is None or "baseline" in config.quantization + ): + if is_cuda: + print(f"Applying {config.sparsity} sparsity to model") + sparsify_(m_copy, ao_base_config) + else: + print( + f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + ) + elif config.sparsity is None and ( + config.quantization is None or "baseline" in config.quantization + ): + pass # No quantization or sparsity specified, do nothing else: - print( - f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + print("Quantizing model....") + quantize_(m_copy, ao_base_config) + + if config.use_torch_compile: + print("Compiling model....") + m_copy = torch.compile( + m_copy, 
mode=config.torch_compile_mode, fullgraph=True ) - elif config.sparsity is None and ( - config.quantization is None or "baseline" in config.quantization - ): - pass # No quantization or sparsity specified, do nothing - else: - print("Quantizing model....") - quantize_(m_copy, ao_base_config) - - if config.use_torch_compile: - print("Compiling model....") - m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) - - # Run benchmarks - result = BenchmarkResult(config=config) - - # Benchmark time to run an inference call for quantized model - result.model_inference_time_in_ms = model_inference_time_in_ms( - model=m_copy, input_data=input_data - ) - - # TODO: Benchmark time using profiler - # Profile dtype model evaluation - # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype) - # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details - - # TODO: Benchmark gemm time using cuda graph - # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs) - - # TODO: Benchmark op with cuda graph - # time = benchmark_op_with_cuda_graph(op, args) - - return result + + # Run benchmarks + result = BenchmarkResult(config=config) + # Store result in model for memory profiling + m_copy._benchmark_result = result + + # Benchmark time to run an inference call for quantized model + result.model_inference_time_in_ms = model_inference_time_in_ms( + model=m_copy, input_data=input_data + ) + + # Run profiler if enabled + if config.enable_profiler: + print("Running profiler...") + try: + result.profiler_json_path, result.perfetto_url = generate_model_profile( + m_copy, input_data, config.profiler_file_name + ) + except Exception as e: + print(f"Error running profiler: {e}") + + # Run memory profiler if enabled + if config.enable_memory_profile: + print("Running memory profiler...") + try: + result.memory_profile_path, result.memory_stats = ( + generate_memory_profile( + m_copy, input_data, config.memory_profile_file_name + ) + ) + except Exception as e: + print(f"Error running memory profiler: {e}") + + return result + except Exception as e: + print(f"Error in benchmark run: {e}") + import traceback + + print(traceback.format_exc()) + return None diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 7152542eec..1a60ca6b16 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -164,16 +164,22 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}" ) result = run_inference(config) # Pass the config object directly - results.append(result) - except Exception: - print(f"Error running benchmark {config.name}") - continue + if result is not None: # Only add successful results + results.append(result) + except Exception as e: + import traceback - # Add results to csv - generate_results_csv(results, configs[0].output_dir) + print(f"Error running benchmark {config.name} with error: {e}") + print(traceback.format_exc()) + continue - # Print results - print_results(results) + # Add results to csv if there are any + if results: + generate_results_csv(results, configs[0].output_dir) + # Print results + print_results(results) + else: + print("No benchmark results were collected. All benchmarks failed.") # TODO: Process results: Speedups: # 1. 
For different shapes for same model and quantization diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 97a38469de..227cb90948 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -2,46 +2,51 @@ benchmark_mode: "inference" quantization_config_recipe_names: # Will run a baseline inference for model by default, without quantization for comparison - - "int4wo-32" - - "marlin" -sparsity_config_recipe_names: + # - "int4wo-32" + # - "marlin" + - "int8wo" +# sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - - "semi-sparse" - - "block" + # - "semi-sparse" + # - "block" output_dir: "benchmarks/microbenchmarks/results" model_params: - - name: "small_bf16_linear" - matrix_shapes: - - name: "custom" - shapes: [ - [1024, 1024, 1024], # [m, k, n] - ] - high_precision_dtype: "torch.bfloat16" - use_torch_compile: true - torch_compile_mode: "max-autotune" - device: "cuda" - model_type: "linear" + # - name: "small_bf16_linear" + # matrix_shapes: + # - name: "custom" + # shapes: [ + # [1024, 1024, 1024], # [m, k, n] + # ] + # high_precision_dtype: "torch.bfloat16" + # use_torch_compile: true + # torch_compile_mode: "max-autotune" + # device: "cuda" + # model_type: "linear" + # enable_profiler: true # Enable profiling for this model - name: "large_bf16_ln_linear" matrix_shapes: - name: "custom" shapes: [ [2048, 4096, 1024], - [4096, 4096, 1024] + # [4096, 4096, 1024] ] high_precision_dtype: "torch.bfloat16" use_torch_compile: true torch_compile_mode: "max-autotune" device: "cuda" - model_type: "ln_linear_sigmoid" - - - name: "cpu_fp32_linear" - matrix_shapes: - - name: "custom" - shapes: [ - [4096, 4096, 1024] - ] - high_precision_dtype: "torch.float32" - use_torch_compile: false - device: "cpu" model_type: "linear" + enable_profiler: true # Enable profiling for this model + enable_memory_profile: true # Enable memory profiling for this model + + # - name: "cpu_fp32_linear" + # matrix_shapes: + # - name: "custom" + # shapes: [ + # [4096, 4096, 1024] + # ] + # high_precision_dtype: "torch.float32" + # use_torch_compile: false + # device: "cpu" + # model_type: "linear" + # enable_profiler: true # Enable profiling for this model diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index fd3db11591..624d880a03 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -4,11 +4,15 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
import csv +import json import os +import subprocess +import uuid from typing import Any, Dict, List, Optional import torch from tabulate import tabulate +from torch.profiler import ProfilerActivity from torch.utils.benchmark import Timer from torchao.core.config import AOBaseConfig @@ -50,6 +54,262 @@ def get_default_device(device: str = "cuda") -> str: return "cpu" +def upload_trace_file(local_path: str, overwrite: bool = False) -> Optional[str]: + MANIFOLD_FOLDER = "perfetto_internal_traces/tree/shared_trace" + DEFAULT_TTL_SEC = 28 * 24 * 60 * 60 + file_name = os.path.basename(local_path) + manifold_path = os.path.join( + MANIFOLD_FOLDER, f"{os.getlogin()}_{str(uuid.uuid4())}_{file_name}" + ) + cmd = [ + "manifold", + "put", + local_path, + manifold_path, + "--ttl", + str(DEFAULT_TTL_SEC), + "--userData", + "false", + ] + ret = subprocess.run( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True + ) + if ret.returncode == 0: + print("Upload trace successfully.") + return manifold_path + else: + print("[ERROR] Upload failed, maybe the trace file exists.") + return None + + +def print_perfetto_ui_url(manifold_path: str) -> Optional[str]: + """Generate and print the Perfetto UI URL for a Manifold trace file. + + Args: + manifold_path: Path to the trace file in Manifold + + Returns: + The URL to the Perfetto UI or None if there was an error + """ + try: + url = ( + "https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html#!/?url=https://interncache-all.fbcdn.net/manifold/" + + manifold_path + ) + print(f"The trace is accessible at:\n{url}") + return url + except Exception as e: + print(f"Error generating Perfetto UI URL: {e}") + return None + + +def generate_model_profile(model, input_data, profile_file_path): + """Function to benchmark model evaluation with profiling. 
+ + Args: + model: The model to profile + input_data: Input data for the model + profile_file_path: Path to save the profiler output + + Returns: + Tuple of (profile_file_path, perfetto_url) + """ + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(profile_file_path), exist_ok=True) + + # Set up profiler activities based on device + activities = [ProfilerActivity.CPU] + device = next(model.parameters()).device + if device.type == "cuda" and torch.cuda.is_available(): + activities.append(ProfilerActivity.CUDA) + + # Run profiler with minimal settings to ensure compatibility + prof = torch.profiler.profile( + activities=activities, + record_shapes=True, + with_stack=False, # Disable stack traces to reduce overhead + profile_memory=False, # Disable memory profiling as it's not reliable across all devices + ) + + # Warm up + with torch.no_grad(): + for _ in range(3): + _ = model(input_data) + if device.type == "cuda": + torch.cuda.synchronize() + + # Profile + with prof: + with torch.no_grad(): + _ = model(input_data) + if device.type == "cuda": + torch.cuda.synchronize() + + # Save profiling details + prof.export_chrome_trace(profile_file_path) + print(f"Profile saved to: {profile_file_path}") + + # Try to upload to Perfetto UI + perfetto_url = None + try: + manifold_path = upload_trace_file(profile_file_path) + if manifold_path: + perfetto_url = print_perfetto_ui_url(manifold_path) + except Exception as e: + print(f"Warning: Failed to upload profile to Perfetto UI: {e}") + + return profile_file_path, perfetto_url + + +# def visualize_memory_profile(pickle_path: str, output_html_path: Optional[str] = None) -> Optional[str]: +# """Visualize memory profile from pickle file using PyTorch's memory visualization tools. + +# Args: +# pickle_path: Path to the pickle file containing memory snapshot +# output_html_path: Optional path to save the HTML visualization. If None, will use pickle_path with .html extension + +# Returns: +# Path to the generated HTML file if successful, None otherwise +# """ +# try: +# import subprocess +# import sys + +# if output_html_path is None: +# output_html_path = pickle_path.replace('.pickle', '.html') + +# # Get the path to PyTorch's memory visualization script +# pytorch_dir = os.path.dirname(os.path.dirname(torch.__file__)) +# memory_viz_script = os.path.join(pytorch_dir, "torch", "cuda", "_memory_viz.py") + +# if not os.path.exists(memory_viz_script): +# print(f"Warning: Memory visualization script not found at {memory_viz_script}") +# return None + +# # Run the visualization script +# cmd = [ +# sys.executable, +# memory_viz_script, +# "trace_plot", +# pickle_path, +# "-o", +# output_html_path +# ] + +# result = subprocess.run( +# cmd, +# stdout=subprocess.PIPE, +# stderr=subprocess.PIPE, +# universal_newlines=True +# ) + +# if result.returncode == 0: +# print(f"Memory visualization saved to: {output_html_path}") +# return output_html_path +# else: +# print(f"Warning: Failed to generate memory visualization: {result.stderr}") +# return None + +# except Exception as e: +# print(f"Warning: Failed to generate memory visualization: {e}") +# return None + + +def generate_memory_profile(model, input_data, profile_file_path): + """Function to generate memory profile for model evaluation. 
+ + Args: + model: The model to profile + input_data: Input data for the model + profile_file_path: Path to save the memory profile output + + Returns: + Tuple of (profile_file_path, memory_stats) + """ + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(profile_file_path), exist_ok=True) + + device = next(model.parameters()).device + memory_stats = { + "peak_memory_allocated": 0, + "peak_memory_reserved": 0, + "total_memory_allocated": 0, + "total_memory_reserved": 0, + "memory_events": [], + } + + if device.type == "cuda": + # Enable memory history recording for CUDA + torch.cuda.memory._record_memory_history( + True, trace_alloc_max_entries=250000, trace_alloc_record_context=True + ) + + # Reset CUDA memory stats + torch.cuda.reset_peak_memory_stats() + torch.cuda.empty_cache() + + # Warm up + with torch.no_grad(): + for _ in range(3): + _ = model(input_data) + torch.cuda.synchronize() + + # Profile memory + with torch.no_grad(): + _ = model(input_data) + torch.cuda.synchronize() + + # Collect memory stats + memory_stats.update( + { + "peak_memory_allocated": torch.cuda.max_memory_allocated() + / 1024**2, # Convert to MB + "peak_memory_reserved": torch.cuda.max_memory_reserved() / 1024**2, + "total_memory_allocated": torch.cuda.memory_allocated() / 1024**2, + "total_memory_reserved": torch.cuda.memory_reserved() / 1024**2, + } + ) + + # Get detailed memory snapshot + snapshot = torch.cuda.memory._snapshot() + + # Save memory profile as pickle file + pickle_path = profile_file_path.replace(".json", ".pickle") + with open(pickle_path, "wb") as f: + from pickle import dump + + dump(snapshot, f) + + print(f"Memory profile saved to: {pickle_path}") + print( + f"\nmemory profile {pickle_path} saved, to convert that to a usable file, use", + "python pytorch/torch/cuda/_memory_viz.py trace_plot -o .html", + ) + + # TODO: Generate HTML visualization + # html_path = visualize_memory_profile(pickle_path) + # if html_path: + # # Set the visualization path in the result + # if isinstance(model, torch.nn.Module): + # result = getattr(model, '_benchmark_result', None) + # if result is not None: + # result.memory_visualization_path = html_path + + # Disable memory history recording + torch.cuda.memory._record_memory_history(False) + + else: + print("Memory profiling only works on CUDA devices") + # TODO: Add XPU support when available + return profile_file_path, memory_stats + + # Save basic stats as JSON for easy access + with open(profile_file_path, "w") as f: + json.dump(memory_stats, f, indent=2) + + return profile_file_path, memory_stats + + class BenchmarkConfig: def __init__( self, @@ -84,6 +344,17 @@ def __init__( "name", f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", ) + self.enable_profiler = bool(params.get("enable_profiler", False)) + self.enable_memory_profile = bool(params.get("enable_memory_profile", False)) + # Create profiler directory path without leading slash + profiler_dir = os.path.join(self.output_dir, "profiler") + os.makedirs(profiler_dir, exist_ok=True) + self.profiler_file_name = os.path.join( + profiler_dir, f"{self.name}_{self.m}_{self.k}_{self.n}_profile.json" + ) + self.memory_profile_file_name = os.path.join( + profiler_dir, f"{self.name}_{self.m}_{self.k}_{self.n}_memory_profile.json" + ) @staticmethod def _parse_precision(precision_str: str) -> torch.dtype: @@ -105,6 +376,8 @@ def to_dict(self) -> Dict[str, Any]: "device": self.device, "model_type": 
self.model_type, "output_dir": self.output_dir, + "enable_profiler": self.enable_profiler, + "enable_memory_profile": self.enable_memory_profile, } @@ -116,13 +389,24 @@ def __init__( self.config = config self.output_dir = config.output_dir self.model_inference_time_in_ms = 0.0 + self.profiler_json_path: Optional[str] = None + self.perfetto_url: Optional[str] = None + self.memory_profile_path: Optional[str] = None + self.memory_stats: Optional[Dict[str, Any]] = None + # self.memory_visualization_path: Optional[str] = None def to_dict(self) -> Dict[str, Any]: """Convert result to dictionary for main function""" - return { + result_dict = { **self.config.to_dict(), "model_inference_time_in_ms": self.model_inference_time_in_ms, + "profiler_json_path": self.profiler_json_path, + "perfetto_url": self.perfetto_url, + "memory_profile_path": self.memory_profile_path, + "memory_stats": self.memory_stats, + # "memory_visualization_path": self.memory_visualization_path, } + return result_dict class ToyLinearModel(torch.nn.Module): @@ -373,6 +657,11 @@ def generate_results_csv( output_dir (str): Directory to save the CSV file. file_name (str, optional): Name of the CSV file. Defaults to "results.csv". """ + # Check if results list is empty + if not results: + print("No results to save to CSV.") + return + # Create the output directory if it doesn't exist os.makedirs(output_dir, exist_ok=True) file_path = os.path.join(output_dir, file_name) @@ -390,68 +679,50 @@ def generate_results_csv( def print_results(results: List[BenchmarkResult]): - """Print benchmark results in a formatted table. - - Args: - results (List[BenchmarkResult]): List of benchmark results - """ + """Print results in a table format""" if not results: print("No results to display") return - # Extract relevant columns for display - display_columns = [ - "quantization", - "sparsity", - "model_type", - "m", - "k", - "n", - "model_inference_time_in_ms", - "use_torch_compile", - ] - - # Format data for tabulate - headers = { - "quantization": "Quantization", - "sparsity": "Sparsity", - "model_type": "Model Type", - "m": "M", - "k": "K", - "n": "N", - "model_inference_time_in_ms": "Time (μs)", - "use_torch_compile": "Compile Mode", - } - - # Extract and format data table_data = [] for result in results: - result_dict = result.to_dict() - row = [] - for col in display_columns: - value = result_dict.get(col, "N/A") - if value is None: - value = "N/A" - if col == "model_inference_time_in_ms": - value = f"{value:.2f}" if isinstance(value, (int, float)) else value - elif col == "use_torch_compile": - # Show compile mode if compile is True, otherwise show False - value = ( - result_dict.get("torch_compile_mode", "default") - if result_dict.get("use_torch_compile") - else "False" + if result is None: + continue + + row = [ + result.config.name, + result.config.quantization or "baseline", + result.config.sparsity or "none", + f"{result.model_inference_time_in_ms:.2f}", + str(result.config.enable_profiler), + str(result.config.enable_memory_profile), + ] + + # Add memory profile data if enabled + if result.config.enable_memory_profile: + if result.memory_stats: + row.append( + f"Peak memory: {result.memory_stats['peak_memory_allocated']:.2f}MB" ) - row.append(value) + else: + row.append("Memory profiling failed") + table_data.append(row) - # Print formatted table - print("\nBenchmark Results:") - print( - tabulate( - table_data, - headers=[headers[col] for col in display_columns], - tablefmt="grid", - floatfmt=".2f", - ) - ) - print() + # Define 
headers + headers = [ + "Name", + "Quantization", + "Sparsity", + "Inference Time (ms)", + "Profiler Enabled", + "Memory Profiling Enabled", + ] + if any(r.config.enable_memory_profile for r in results if r is not None): + headers.append("Memory Profile Data") + + if table_data: + print("\nBenchmark Results:") + print(tabulate(table_data, headers=headers, tablefmt="grid")) + else: + print("\nNo valid results to display")
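
Usage sketch (editor's note, not part of the patch): the snippet below shows how the new generate_memory_profile() helper added in benchmarks/microbenchmarks/utils.py could be exercised on its own, mirroring what run() does when enable_memory_profile is set in the YAML config. The import path and the toy nn.Linear model are assumptions for illustration only; in the benchmark itself the model comes from create_model_and_input() and the output path from BenchmarkConfig.memory_profile_file_name. Requires a CUDA device, since the profiler falls back to a warning on CPU.

    import torch
    from benchmarks.microbenchmarks.utils import generate_memory_profile  # assumed module path

    # Stand-in for the benchmark's model_type="linear" with m=k=n=1024.
    model = (
        torch.nn.Linear(1024, 1024, bias=False)
        .to(device="cuda", dtype=torch.bfloat16)
        .eval()
    )
    input_data = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)

    # Returns (json_path, stats_dict); also writes a .pickle CUDA memory snapshot next to
    # the JSON that can be rendered with torch/cuda/_memory_viz.py trace_plot.
    json_path, memory_stats = generate_memory_profile(
        model,
        input_data,
        "benchmarks/microbenchmarks/results/profiler/demo_memory_profile.json",
    )
    print(f"Peak allocated: {memory_stats['peak_memory_allocated']:.2f} MB")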