Commit 5c8e5e2

SSYernar authored and facebook-github-bot committed
Added CPU Memory Stats to Benchmarks (#3231)
Summary: Pull Request resolved: #3231

* Introduced functionality to capture and report peak CPU RSS memory usage statistics in the benchmarking utilities.
* CPU Resident Set Size (RSS) measures the portion of a process's memory that is held in RAM, including the process's code, data, and stack, as well as any dynamically allocated memory. Monitoring RSS helps identify memory-intensive operations and optimize resource usage to prevent memory-related issues.
* Enhanced the `BenchmarkResult` class to include CPU memory metrics alongside the existing GPU metrics.
* Updated the relevant files for compatibility with the expanded `BenchmarkResult` class.
* This enhancement gives users insight into peak CPU memory usage during benchmarking, aiding in the identification of memory bottlenecks.

Example metrics for FBGEMM operators:

| Operator | CPU Runtime | GPU Runtime | GPU Peak Memory Alloc | GPU Peak Memory Reserved | CPU Peak RSS |
|---|---|---|---|---|---|
| **[pytorch generic] fallback** | 4.85 ms | 2.50 ms | 1.01 GB | 1.53 GB | 1.38 GB |
| **[Prod] KeyedTensor.regroup** | 8.49 ms | 2.76 ms | 1.52 GB | 2.04 GB | 1.40 GB |
| **[Module] KTRegroupAsDict** | 0.52 ms | 0.72 ms | 1.01 GB | 2.04 GB | 1.40 GB |
| **[2 Ops] permute_multi_embs** | 3.11 ms | 1.81 ms | 1.01 GB | 2.04 GB | 1.41 GB |
| **[1 Op] KT_regroup** | 2.17 ms | 1.56 ms | 1.01 GB | 2.04 GB | 1.42 GB |
| **[Old Prod] permute_pooled_embs** | 4.00 ms | 2.69 ms | 1.52 GB | 2.04 GB | 1.43 GB |
| **[pytorch generic] fallback_dup** | 4.84 ms | 2.41 ms | 1.01 GB | 2.04 GB | 1.44 GB |
| **[Prod] KeyedTensor.regroup_dup** | 3.86 ms | 2.57 ms | 1.01 GB | 2.04 GB | 1.44 GB |
| **[Module] KTRegroupAsDict_dup** | 0.15 ms | 0.76 ms | 1.01 GB | 2.04 GB | 1.44 GB |
| **[2 Ops] permute_multi_embs_dup** | 0.86 ms | 1.47 ms | 1.01 GB | 2.04 GB | 1.44 GB |
| **[1 Op] KT_regroup_dup** | 1.01 ms | 1.60 ms | 1.01 GB | 2.04 GB | 1.44 GB |

Reviewed By: aliafzal

Differential Revision: D78860097

fbshipit-source-id: d60ab6b9886d75019fafe7ba0b2645f80922ab9a
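For reference, a minimal standalone sketch of the measurement this change builds on: reading the process's peak RSS through Python's `resource` module. The `peak_rss_mb` helper name is illustrative (not part of the diff), and the kilobyte-to-megabyte conversion assumes Linux semantics for `ru_maxrss` (macOS reports bytes).

```python
import resource


def peak_rss_mb() -> int:
    # ru_maxrss is the peak resident set size of this process; on Linux it is
    # reported in kilobytes (on macOS it is bytes), so this assumes Linux.
    peak_rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return peak_rss_kb // 1024


if __name__ == "__main__":
    buf = [0] * 10_000_000  # ~80 MB of list storage so the peak is visible
    print(f"CPU Peak RSS: {peak_rss_mb() / 1000:.2f} GB")
```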
1 parent 44f4bb5 · commit 5c8e5e2

File tree: 3 files changed, +77 -26 lines changed


torchrec/distributed/benchmark/benchmark_utils.py

Lines changed: 66 additions & 22 deletions
@@ -18,6 +18,7 @@
 import json
 import logging
 import os
+import resource
 import time
 import timeit
 from dataclasses import dataclass, fields, is_dataclass, MISSING
@@ -108,14 +109,14 @@ class CompileMode(Enum):


 @dataclass
-class MemoryStats:
+class GPUMemoryStats:
     rank: int
     malloc_retries: int
     max_mem_allocated_mbs: int
     max_mem_reserved_mbs: int

     @classmethod
-    def for_device(cls, rank: int) -> "MemoryStats":
+    def for_device(cls, rank: int) -> "GPUMemoryStats":
         stats = torch.cuda.memory_stats(rank)
         alloc_retries = stats.get("num_alloc_retries", 0)
         max_allocated = stats.get("allocated_bytes.all.peak", 0)
@@ -131,13 +132,31 @@ def __str__(self) -> str:
         return f"Rank {self.rank}: retries={self.malloc_retries}, allocated={self.max_mem_allocated_mbs:7}mb, reserved={self.max_mem_reserved_mbs:7}mb"


+@dataclass
+class CPUMemoryStats:
+    rank: int
+    peak_rss_mbs: int
+
+    @classmethod
+    def for_process(cls, rank: int) -> "CPUMemoryStats":
+        # Peak RSS from resource.getrusage (in KB on CentOS/Linux)
+        peak_rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+        peak_rss_mb = peak_rss_kb // 1024
+
+        return cls(rank, peak_rss_mb)
+
+    def __str__(self) -> str:
+        return f"Rank {self.rank}: CPU Memory Peak RSS: {self.peak_rss_mbs/1000:.2f} GB"
+
+
 @dataclass
 class BenchmarkResult:
     "Class for holding results of benchmark runs"
     short_name: str
     gpu_elapsed_time: torch.Tensor  # milliseconds
     cpu_elapsed_time: torch.Tensor  # milliseconds
-    mem_stats: List[MemoryStats]  # memory stats per rank
+    gpu_mem_stats: List[GPUMemoryStats]  # GPU memory stats per rank
+    cpu_mem_stats: List[CPUMemoryStats]  # CPU memory stats per rank
     rank: int = -1

     def __str__(self) -> str:
@@ -147,14 +166,16 @@ def __str__(self) -> str:
         cpu_runtime = (
             f"CPU Runtime (P90): {self.runtime_percentile(90, device='cpu'):.2f} ms"
         )
-        if len(self.mem_stats) == 0:
-            return f"{self.short_name: <{35}} | {gpu_runtime} | {cpu_runtime}"
-        mem_alloc = (
-            f"Peak Memory alloc (P90): {self.max_mem_alloc_percentile(90)/1000:.2f} GB"
-        )
-        mem_reserved = f"Peak Memory reserved (P90): {self.max_mem_reserved_percentile(90)/1000:.2f} GB"
+        cpu_mem = f"CPU Peak RSS (P90): {self.cpu_mem_percentile(90)/1000:.2f} GB"
+
+        if len(self.gpu_mem_stats) == 0:
+            return (
+                f"{self.short_name: <{35}} | {gpu_runtime} | {cpu_runtime} | {cpu_mem}"
+            )
+        mem_alloc = f"GPU Peak Memory alloc (P90): {self.max_mem_alloc_percentile(90)/1000:.2f} GB"
+        mem_reserved = f"GPU Peak Memory reserved (P90): {self.max_mem_reserved_percentile(90)/1000:.2f} GB"
         malloc_retries = f"Malloc retries (P50/P90/P100): {self.mem_retries(50)} / {self.mem_retries(90)} / {self.mem_retries(100)}"
-        return f"{self.short_name: <{35}} | {malloc_retries} | {gpu_runtime} | {cpu_runtime} | {mem_alloc} | {mem_reserved}"
+        return f"{self.short_name: <{35}} | {malloc_retries} | {gpu_runtime} | {cpu_runtime} | {mem_alloc} | {mem_reserved} | {cpu_mem}"

     def runtime_percentile(
         self,
@@ -199,15 +220,28 @@ def mem_retries(

     def _mem_percentile(
         self,
-        mem_selector: Callable[[MemoryStats], int],
+        mem_selector: Callable[[GPUMemoryStats], int],
         percentile: int = 50,
         interpolation: str = "nearest",
     ) -> torch.Tensor:
         mem_data = torch.tensor(
-            [mem_selector(mem_stat) for mem_stat in self.mem_stats], dtype=torch.float
+            [mem_selector(mem_stat) for mem_stat in self.gpu_mem_stats],
+            dtype=torch.float,
         )
         return torch.quantile(mem_data, percentile / 100.0, interpolation=interpolation)

+    def cpu_mem_percentile(
+        self, percentile: int = 50, interpolation: str = "nearest"
+    ) -> torch.Tensor:
+        """Return the CPU memory percentile for peak RSS."""
+        cpu_mem_data = torch.tensor(
+            [cpu_stat.peak_rss_mbs for cpu_stat in self.cpu_mem_stats],
+            dtype=torch.float,
+        )
+        return torch.quantile(
+            cpu_mem_data, percentile / 100.0, interpolation=interpolation
+        )
+

 class ECWrapper(torch.nn.Module):
     """
@@ -437,8 +471,11 @@ def write_report(
         qps_gpu = int(num_requests / avg_dur_s_gpu)

         mem_str = ""
-        for memory_stats in benchmark_res.mem_stats:
-            mem_str += f"{memory_stats}\n"
+        for gpu_memory_stats in benchmark_res.gpu_mem_stats:
+            mem_str += f"{gpu_memory_stats}\n"
+
+        for cpu_memory_stats in benchmark_res.cpu_mem_stats:
+            mem_str += f"{cpu_memory_stats}\n"

         report_str += (
             f"{benchmark_res.short_name:40} "
@@ -816,13 +853,16 @@ def _run_benchmark_core(
         gpu_elapsed_time = cpu_elapsed_time.clone()

     # Memory statistics collection
-    mem_stats: List[MemoryStats] = []
+    gpu_mem_stats: List[GPUMemoryStats] = []
+    cpu_mem_stats = [CPUMemoryStats.for_process(rank)]
+
     if device_type == "cuda":
         if rank == -1:
             for di in range(world_size):
-                mem_stats.append(MemoryStats.for_device(di))
+                gpu_mem_stats.append(GPUMemoryStats.for_device(di))
         else:
-            mem_stats.append(MemoryStats.for_device(rank))
+            gpu_mem_stats.append(GPUMemoryStats.for_device(rank))
+    # CPU memory stats are collected for both GPU and CPU-only runs

     # Optional detailed profiling
     if output_dir and profile_iter_fn and device_type == "cuda":
@@ -868,7 +908,8 @@ def _trace_handler(prof: torch.profiler.profile) -> None:
         short_name=name,
         gpu_elapsed_time=gpu_elapsed_time,
         cpu_elapsed_time=cpu_elapsed_time,
-        mem_stats=mem_stats,
+        gpu_mem_stats=gpu_mem_stats,
+        cpu_mem_stats=cpu_mem_stats,
         rank=rank,
     )

@@ -1139,7 +1180,8 @@ def setUp() -> None:
         res = qq.get()

         benchmark_res_per_rank.append(res)
-        assert len(res.mem_stats) == 1
+        assert len(res.gpu_mem_stats) == 1
+        assert len(res.cpu_mem_stats) == 1

     for p in processes:
         p.join()
@@ -1149,13 +1191,15 @@ def setUp() -> None:
         short_name=benchmark_res_per_rank[0].short_name,
         gpu_elapsed_time=benchmark_res_per_rank[0].gpu_elapsed_time,
         cpu_elapsed_time=benchmark_res_per_rank[0].cpu_elapsed_time,
-        mem_stats=[MemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        gpu_mem_stats=[GPUMemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        cpu_mem_stats=[CPUMemoryStats(rank, 0) for rank in range(world_size)],
         rank=0,
     )

     for res in benchmark_res_per_rank:
-        # Each rank's BenchmarkResult contains 1 memory measurement
-        total_benchmark_res.mem_stats[res.rank] = res.mem_stats[0]
+        # Each rank's BenchmarkResult contains 1 GPU and 1 CPU memory measurement
+        total_benchmark_res.gpu_mem_stats[res.rank] = res.gpu_mem_stats[0]
+        total_benchmark_res.cpu_mem_stats[res.rank] = res.cpu_mem_stats[0]

     return total_benchmark_res
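To make the new surface concrete, here is a minimal usage sketch (not part of the diff) built against the classes shown above; the two-rank setup, timing values, and zeroed GPU stats are made up for illustration.

```python
import torch

from torchrec.distributed.benchmark.benchmark_utils import (
    BenchmarkResult,
    CPUMemoryStats,
    GPUMemoryStats,
)

# Hypothetical two-rank result: zeroed GPU stats, measured CPU peak RSS.
result = BenchmarkResult(
    short_name="toy_op",
    gpu_elapsed_time=torch.tensor([1.2, 1.3]),  # milliseconds
    cpu_elapsed_time=torch.tensor([2.5, 2.6]),  # milliseconds
    gpu_mem_stats=[GPUMemoryStats(rank, 0, 0, 0) for rank in range(2)],
    cpu_mem_stats=[CPUMemoryStats.for_process(rank) for rank in range(2)],
)

# __str__ now appends "CPU Peak RSS (P90): ... GB" to the summary line.
print(result)
print(f"P90 CPU peak RSS: {result.cpu_mem_percentile(90).item() / 1000:.2f} GB")
```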

torchrec/sparse/tests/jagged_tensor_benchmark.py

Lines changed: 4 additions & 2 deletions
@@ -18,7 +18,8 @@
 from torchrec.distributed.benchmark.benchmark_utils import (
     benchmark,
     BenchmarkResult,
-    MemoryStats,
+    CPUMemoryStats,
+    GPUMemoryStats,
 )
 from torchrec.modules.regroup import KTRegroupAsDict
 from torchrec.sparse.jagged_tensor import (
@@ -109,7 +110,8 @@ def wrapped_func(
         short_name=name,
         gpu_elapsed_time=torch.tensor(times) * 1e3,
         cpu_elapsed_time=torch.tensor(times) * 1e3,
-        mem_stats=[MemoryStats(0, 0, 0, 0)],
+        gpu_mem_stats=[GPUMemoryStats(0, 0, 0, 0)],
+        cpu_mem_stats=[CPUMemoryStats.for_process(0)],
     )

     print(

torchrec/sparse/tests/keyed_jagged_tensor_benchmark_lib.py

Lines changed: 7 additions & 2 deletions
@@ -20,7 +20,11 @@
 # Otherwise will get error
 # NotImplementedError: fbgemm::permute_1D_sparse_data: We could not find the abstract impl for this operator.
 from fbgemm_gpu import sparse_ops  # noqa: F401, E402
-from torchrec.distributed.benchmark.benchmark_utils import BenchmarkResult, MemoryStats
+from torchrec.distributed.benchmark.benchmark_utils import (
+    BenchmarkResult,
+    CPUMemoryStats,
+    GPUMemoryStats,
+)
 from torchrec.distributed.dist_data import _get_recat

 from torchrec.distributed.test_utils.test_model import ModelInput
@@ -229,7 +233,8 @@ def benchmark_kjt(
         short_name=f"{test_name}-{transform_type.name}",
         gpu_elapsed_time=torch.tensor(times),
         cpu_elapsed_time=torch.tensor(times),
-        mem_stats=[MemoryStats(0, 0, 0, 0)],
+        gpu_mem_stats=[GPUMemoryStats(0, 0, 0, 0)],
+        cpu_mem_stats=[CPUMemoryStats.for_process(0)],
     )

     p50_runtime = result.runtime_percentile(50, interpolation="linear").item()
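For context on the percentile calls used throughout these benchmarks (`runtime_percentile` above, and `cpu_mem_percentile` / `_mem_percentile` in the first file), the underlying pattern is presumably the same: a `torch.quantile` over the collected measurements. A tiny standalone sketch with made-up timings:

```python
import torch

times_ms = torch.tensor([1.9, 2.1, 2.4, 2.6, 3.0])  # made-up per-iteration timings

# Same pattern as the helpers above: quantile over the measurements,
# with a configurable interpolation mode.
p50 = torch.quantile(times_ms, 0.50, interpolation="linear")
p90 = torch.quantile(times_ms, 0.90, interpolation="nearest")
print(f"P50: {p50.item():.2f} ms, P90: {p90.item():.2f} ms")
```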
