import json
import logging
import os
+ import resource
import time
import timeit
from dataclasses import dataclass, fields, is_dataclass, MISSING
@@ -108,14 +109,14 @@ class CompileMode(Enum):


@dataclass
- class MemoryStats:
+ class GPUMemoryStats:
    rank: int
    malloc_retries: int
    max_mem_allocated_mbs: int
    max_mem_reserved_mbs: int

    @classmethod
-     def for_device(cls, rank: int) -> "MemoryStats":
+     def for_device(cls, rank: int) -> "GPUMemoryStats":
        stats = torch.cuda.memory_stats(rank)
        alloc_retries = stats.get("num_alloc_retries", 0)
        max_allocated = stats.get("allocated_bytes.all.peak", 0)
@@ -131,13 +132,31 @@ def __str__(self) -> str:
        return f"Rank {self.rank}: retries={self.malloc_retries}, allocated={self.max_mem_allocated_mbs:7}mb, reserved={self.max_mem_reserved_mbs:7}mb"


+ @dataclass
+ class CPUMemoryStats:
+     rank: int
+     peak_rss_mbs: int
+
+     @classmethod
+     def for_process(cls, rank: int) -> "CPUMemoryStats":
+         # Peak RSS from resource.getrusage (in KB on CentOS/Linux)
+         peak_rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+         peak_rss_mb = peak_rss_kb // 1024
+
+         return cls(rank, peak_rss_mb)
+
+     def __str__(self) -> str:
+         return f"Rank {self.rank}: CPU Memory Peak RSS: {self.peak_rss_mbs / 1000:.2f} GB"
+
+

@dataclass
class BenchmarkResult:
    "Class for holding results of benchmark runs"
    short_name: str
    gpu_elapsed_time: torch.Tensor  # milliseconds
    cpu_elapsed_time: torch.Tensor  # milliseconds
-     mem_stats: List[MemoryStats]  # memory stats per rank
+     gpu_mem_stats: List[GPUMemoryStats]  # GPU memory stats per rank
+     cpu_mem_stats: List[CPUMemoryStats]  # CPU memory stats per rank
    rank: int = -1

    def __str__(self) -> str:
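Note on the new `CPUMemoryStats.for_process`: `resource.getrusage(...).ru_maxrss` is reported in kilobytes on Linux but in bytes on macOS, so the `// 1024` conversion above assumes a Linux host. A minimal platform-aware sketch, illustrative only and not part of this diff (`peak_rss_mb` is a hypothetical helper):

```python
import resource
import sys

def peak_rss_mb() -> int:
    # ru_maxrss is kilobytes on Linux, bytes on macOS.
    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return rss // (1024 * 1024) if sys.platform == "darwin" else rss // 1024
```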
@@ -147,14 +166,16 @@ def __str__(self) -> str:
        cpu_runtime = (
            f"CPU Runtime (P90): {self.runtime_percentile(90, device='cpu'):.2f} ms"
        )
-         if len(self.mem_stats) == 0:
-             return f"{self.short_name: <{35}} | {gpu_runtime} | {cpu_runtime}"
-         mem_alloc = (
-             f"Peak Memory alloc (P90): {self.max_mem_alloc_percentile(90) / 1000:.2f} GB"
-         )
-         mem_reserved = f"Peak Memory reserved (P90): {self.max_mem_reserved_percentile(90) / 1000:.2f} GB"
+         cpu_mem = f"CPU Peak RSS (P90): {self.cpu_mem_percentile(90) / 1000:.2f} GB"
+
+         if len(self.gpu_mem_stats) == 0:
+             return (
+                 f"{self.short_name: <{35}} | {gpu_runtime} | {cpu_runtime} | {cpu_mem}"
+             )
+         mem_alloc = f"GPU Peak Memory alloc (P90): {self.max_mem_alloc_percentile(90) / 1000:.2f} GB"
+         mem_reserved = f"GPU Peak Memory reserved (P90): {self.max_mem_reserved_percentile(90) / 1000:.2f} GB"
        malloc_retries = f"Malloc retries (P50/P90/P100): {self.mem_retries(50)} / {self.mem_retries(90)} / {self.mem_retries(100)}"
-         return f"{self.short_name: <{35}} | {malloc_retries} | {gpu_runtime} | {cpu_runtime} | {mem_alloc} | {mem_reserved}"
+         return f"{self.short_name: <{35}} | {malloc_retries} | {gpu_runtime} | {cpu_runtime} | {mem_alloc} | {mem_reserved} | {cpu_mem}"

    def runtime_percentile(
        self,
@@ -199,15 +220,28 @@ def mem_retries(

    def _mem_percentile(
        self,
-         mem_selector: Callable[[MemoryStats], int],
+         mem_selector: Callable[[GPUMemoryStats], int],
        percentile: int = 50,
        interpolation: str = "nearest",
    ) -> torch.Tensor:
        mem_data = torch.tensor(
-             [mem_selector(mem_stat) for mem_stat in self.mem_stats], dtype=torch.float
+             [mem_selector(mem_stat) for mem_stat in self.gpu_mem_stats],
+             dtype=torch.float,
        )
        return torch.quantile(mem_data, percentile / 100.0, interpolation=interpolation)

+     def cpu_mem_percentile(
+         self, percentile: int = 50, interpolation: str = "nearest"
+     ) -> torch.Tensor:
+         """Return the CPU memory percentile for peak RSS."""
+         cpu_mem_data = torch.tensor(
+             [cpu_stat.peak_rss_mbs for cpu_stat in self.cpu_mem_stats],
+             dtype=torch.float,
+         )
+         return torch.quantile(
+             cpu_mem_data, percentile / 100.0, interpolation=interpolation
+         )
+

class ECWrapper(torch.nn.Module):
    """
@@ -437,8 +471,11 @@ def write_report(
        qps_gpu = int(num_requests / avg_dur_s_gpu)

        mem_str = ""
-         for memory_stats in benchmark_res.mem_stats:
-             mem_str += f"{memory_stats}\n"
+         for gpu_memory_stats in benchmark_res.gpu_mem_stats:
+             mem_str += f"{gpu_memory_stats}\n"
+
+         for cpu_memory_stats in benchmark_res.cpu_mem_stats:
+             mem_str += f"{cpu_memory_stats}\n"

        report_str += (
            f"{benchmark_res.short_name:40} "
@@ -816,13 +853,16 @@ def _run_benchmark_core(
        gpu_elapsed_time = cpu_elapsed_time.clone()

    # Memory statistics collection
-     mem_stats: List[MemoryStats] = []
+     gpu_mem_stats: List[GPUMemoryStats] = []
+     cpu_mem_stats = [CPUMemoryStats.for_process(rank)]
+
    if device_type == "cuda":
        if rank == -1:
            for di in range(world_size):
-                 mem_stats.append(MemoryStats.for_device(di))
+                 gpu_mem_stats.append(GPUMemoryStats.for_device(di))
        else:
-             mem_stats.append(MemoryStats.for_device(rank))
+             gpu_mem_stats.append(GPUMemoryStats.for_device(rank))
+     # CPU memory stats are collected for both GPU and CPU-only runs

    # Optional detailed profiling
    if output_dir and profile_iter_fn and device_type == "cuda":
@@ -868,7 +908,8 @@ def _trace_handler(prof: torch.profiler.profile) -> None:
        short_name=name,
        gpu_elapsed_time=gpu_elapsed_time,
        cpu_elapsed_time=cpu_elapsed_time,
-         mem_stats=mem_stats,
+         gpu_mem_stats=gpu_mem_stats,
+         cpu_mem_stats=cpu_mem_stats,
        rank=rank,
    )

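With both stats lists attached, a single-rank `BenchmarkResult` can be constructed and printed directly; its `__str__` now ends with the CPU Peak RSS column. A minimal sketch, assuming the dataclass fields shown in this diff (import path and all values are illustrative):

```python
import torch
# Assumed import path for the classes in this diff; adjust to the actual module location.
from torchrec.distributed.benchmark.benchmark_utils import (
    BenchmarkResult, CPUMemoryStats, GPUMemoryStats,
)

result = BenchmarkResult(
    short_name="sparse_fwd",
    gpu_elapsed_time=torch.tensor([12.1, 11.8, 12.4]),  # ms per iteration
    cpu_elapsed_time=torch.tensor([15.3, 14.9, 15.6]),  # ms per iteration
    gpu_mem_stats=[GPUMemoryStats(rank=0, malloc_retries=0,
                                  max_mem_allocated_mbs=8192, max_mem_reserved_mbs=9216)],
    cpu_mem_stats=[CPUMemoryStats(rank=0, peak_rss_mbs=20480)],
    rank=0,
)
# Prints one summary line, e.g. "... | GPU Peak Memory alloc (P90): 8.19 GB | ... | CPU Peak RSS (P90): 20.48 GB"
print(result)
```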
@@ -1139,7 +1180,8 @@ def setUp() -> None:
        res = qq.get()

        benchmark_res_per_rank.append(res)
-         assert len(res.mem_stats) == 1
+         assert len(res.gpu_mem_stats) == 1
+         assert len(res.cpu_mem_stats) == 1

    for p in processes:
        p.join()
@@ -1149,13 +1191,15 @@ def setUp() -> None:
        short_name=benchmark_res_per_rank[0].short_name,
        gpu_elapsed_time=benchmark_res_per_rank[0].gpu_elapsed_time,
        cpu_elapsed_time=benchmark_res_per_rank[0].cpu_elapsed_time,
-         mem_stats=[MemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+         gpu_mem_stats=[GPUMemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+         cpu_mem_stats=[CPUMemoryStats(rank, 0) for rank in range(world_size)],
        rank=0,
    )

    for res in benchmark_res_per_rank:
-         # Each rank's BenchmarkResult contains 1 memory measurement
-         total_benchmark_res.mem_stats[res.rank] = res.mem_stats[0]
+         # Each rank's BenchmarkResult contains 1 GPU and 1 CPU memory measurement
+         total_benchmark_res.gpu_mem_stats[res.rank] = res.gpu_mem_stats[0]
+         total_benchmark_res.cpu_mem_stats[res.rank] = res.cpu_mem_stats[0]

    return total_benchmark_res