From 8afd10ed4b22b3cabd80184062c4ad58001bc68a Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 5 Feb 2025 20:03:19 +0800 Subject: [PATCH 1/6] Fix compile issue for Marin qqq on sm<8.0 (#1651) * fix compile guard * remove guard on header file --- .../csrc/cuda/marlin_qqq/marlin_qqq_kernel.cu | 55 ++++--------------- 1 file changed, 10 insertions(+), 45 deletions(-) diff --git a/torchao/csrc/cuda/marlin_qqq/marlin_qqq_kernel.cu b/torchao/csrc/cuda/marlin_qqq/marlin_qqq_kernel.cu index 7380f9aff2..10c3f152bd 100644 --- a/torchao/csrc/cuda/marlin_qqq/marlin_qqq_kernel.cu +++ b/torchao/csrc/cuda/marlin_qqq/marlin_qqq_kernel.cu @@ -30,9 +30,7 @@ #include #include "base.h" -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - #include "mem.h" -#endif +#include "mem.h" template inline std::string str(T x) { @@ -41,8 +39,6 @@ inline std::string str(T x) { namespace torchao { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - using I4 = Vec; // Matrix fragments for tensor core instructions; their precise layout is // documented here: @@ -208,6 +204,8 @@ __global__ void Marlin_QQQ( int prob_k, // reduction dimension k int* locks // extra global storage for barrier synchronization ) { + // host code or device code with SM >= 80. Marlin only supports SM >= 80. +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // Each threadblock processes one "stripe" of the B matrix with (roughly) the // same size, which might involve multiple column "slices" (of width 16 * // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM @@ -855,47 +853,8 @@ __global__ void Marlin_QQQ( } } } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin_QQQ( - const int4* __restrict__ A, // int8 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // int32 global_reduce buffer of shape - // (max_par*16*4)xn, as int8 tensor core's output is - // int32 dtype - int4* __restrict__ D, // fp16 output buffer of shape mxn - const float* __restrict__ s_tok, // fp32 activation per-token quantization - // scales of shape mx1 - const int4* __restrict__ s_ch, // fp32 weight per-channel quantization - // scales of shape 1xn - const int4* __restrict__ s_group, // fp16 weight per-group quantization - // scales of shape (k/groupsize)xn, when - // group_blocks=-1, it should be nullptr - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - TORCH_CHECK_NOT_IMPLEMENTED( - false, "marlin_qqq_gemm(..) requires CUDA_ARCH >= 8.0"); - return; -} - #endif +} // 8 warps are a good choice since every SM has 4 schedulers and having more // than 1 warp per schedule allows some more latency hiding. At the same time, @@ -1132,6 +1091,12 @@ torch::Tensor marlin_qqq_gemm(torch::Tensor const& a, torch::Tensor const& s_group, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k) { + const auto dprops = at::cuda::getCurrentDeviceProperties(); + if (dprops->major < 8) { + TORCH_CHECK(false, __func__, "requires SM >= 8.0. Current device is SM", + dprops->major, ".", dprops->minor); + } + // Verify M TORCH_CHECK(size_m == a.size(0), "Shape mismatch: a.size(0) = " + str(a.size(0)) + From 8d14f0eec2fade8194c7a4767ac4ba96bfd2dd2e Mon Sep 17 00:00:00 2001 From: cpuhrsch Date: Wed, 5 Feb 2025 13:27:29 -0800 Subject: [PATCH 2/6] SAM2: more export, small perf improvements (#1673) --- .../sam2_amg_server/compile_export_utils.py | 219 +++++++++++++++--- examples/sam2_amg_server/generate_data.py | 54 ++++- .../sam2_amg_server/reproduce_experiments.py | 2 +- examples/sam2_amg_server/result.csv | 140 +++++------ .../_models/sam2/automatic_mask_generator.py | 10 +- .../sam2/modeling/sam/prompt_encoder.py | 6 + torchao/_models/sam2/sam2_image_predictor.py | 17 +- torchao/_models/sam2/utils/transforms.py | 9 +- 8 files changed, 326 insertions(+), 131 deletions(-) diff --git a/examples/sam2_amg_server/compile_export_utils.py b/examples/sam2_amg_server/compile_export_utils.py index a8f34b0943..5903f4905e 100644 --- a/examples/sam2_amg_server/compile_export_utils.py +++ b/examples/sam2_amg_server/compile_export_utils.py @@ -48,7 +48,6 @@ def forward( boxes: Optional[torch.Tensor] = None, mask_input: Optional[torch.Tensor] = None, multimask_output: bool = True, - img_idx: int = -1, ): assert high_res_feats[0].size() == (self.batch_size, 32, 256, 256) assert high_res_feats[1].size() == (self.batch_size, 64, 128, 128) @@ -73,7 +72,6 @@ def forward( assert boxes is None assert mask_input is None assert multimask_output - assert img_idx == -1 if self.predictor is None: assert self.aoti_compiled_model is not None return self.aoti_compiled_model( @@ -85,7 +83,6 @@ def forward( boxes=boxes, mask_input=mask_input, multimask_output=multimask_output, - img_idx=img_idx, ) return self.predictor._predict_masks( high_res_feats, @@ -96,7 +93,6 @@ def forward( boxes=boxes, mask_input=mask_input, multimask_output=multimask_output, - img_idx=img_idx, ) @@ -176,10 +172,137 @@ def export_model( overwrite=overwrite, ) - print(f"{task_type} cannot export _predict_masks") - return + if task_type in []: + example_input_args = () + example_input_kwargs = { + "points": ( + torch.randn( + points_per_batch, + 1, + 2, + dtype=torch.float32, + device=mask_generator.predictor.device, + ), + torch.ones( + points_per_batch, + 1, + dtype=torch.int32, + device=mask_generator.predictor.device, + ), + ), + "boxes": None, + "masks": None, + } + aot_compile( + model_directory, + "sam2_sam_prompt_encoder", + mask_generator.predictor.model.sam_prompt_encoder, + example_input_args, + sample_kwargs=example_input_kwargs, + overwrite=overwrite, + ) + + if task_type in []: + example_input_args = () + example_input_kwargs = { + "image_embeddings": torch.randn( + batch_size, + 256, + 64, + 64, + dtype=torch.float32, + device=mask_generator.predictor.device, + ), + "image_pe": torch.randn( + batch_size, + 256, + 64, + 64, + dtype=torch.float32, + device=mask_generator.predictor.device, + ), + "sparse_prompt_embeddings": torch.randn( + batch_size, + 2, + 256, + dtype=torch.float32, + device=mask_generator.predictor.device, + ), + "dense_prompt_embeddings": torch.randn( + batch_size, + 256, + 64, + 64, + dtype=torch.float32, + device=mask_generator.predictor.device, + ), + "multimask_output": True, + "repeat_image": False, + "high_res_features": [ + torch.randn( + batch_size, + 32, + 256, + 256, + dtype=mask_generator.predictor._image_dtype, + device=mask_generator.predictor.device, + ), + torch.randn( + batch_size, + 64, + 128, + 128, + dtype=mask_generator.predictor._image_dtype, + device=mask_generator.predictor.device, + ), + ], + } + aot_compile( + model_directory, + "sam2_sam_mask_decoder", + mask_generator.predictor.model.sam_mask_decoder, + example_input_args, + sample_kwargs=example_input_kwargs, + overwrite=overwrite, + ) + + if task_type in []: + example_input_args = ( + torch.randn( + points_per_batch, + 256, + 64, + 64, + dtype=mask_generator.predictor.model.sam_mask_decoder._src_dtype, + device=mask_generator.predictor.device, + ), + torch.randn( + points_per_batch, + 256, + 64, + 64, + dtype=mask_generator.predictor.model.sam_mask_decoder._src_dtype, + device=mask_generator.predictor.device, + ), + torch.randn( + points_per_batch, + 8, + 256, + dtype=mask_generator.predictor.model.sam_mask_decoder._src_dtype, + device=mask_generator.predictor.device, + ), + ) + example_input_kwargs = {} + aot_compile( + model_directory, + "sam2_sam_mask_decoder_transformer", + mask_generator.predictor.model.sam_mask_decoder.transformer, + example_input_args, + sample_kwargs=example_input_kwargs, + overwrite=overwrite, + ) - if task_type in ["sps"]: + if task_type in ["amg", "sps"]: example_input_high_res_feats = [ torch.randn( batch_size, @@ -239,7 +362,6 @@ def export_model( "boxes": None, "mask_input": None, "multimask_output": True, - "img_idx": -1, } sam2_image_predict_masks = SAM2ImagePredictor_predict_masks( @@ -301,30 +423,54 @@ def load_exported_model( pkg_m = LoadedModel(pkg) mask_generator.predictor.model.image_encoder = pkg_m - print(f"End load image encoder. Took {time.time() - t0}s") - return mask_generator - - if task_type in ["amg", "mps"]: + if task_type in ["mps"]: return mask_generator - path = Path(model_directory) / Path("sam2_image_predict_masks.pt2") - assert path.exists(), f"Expected {path} to exist" - print(f"Start load from {path}") - pkg = torch._inductor.aoti_load_package(str(path)) - if task_type == "amg": - assert points_per_batch > 1 - if task_type == "sps": - assert points_per_batch == 1 - if task_type == "mps": - assert points_per_batch is None - pkg_m = SAM2ImagePredictor_predict_masks( - None, - batch_size=batch_size, - points_per_batch=points_per_batch, - aoti_compiled_model=pkg, - furious=furious, - ) - mask_generator.predictor._predict_masks = pkg_m.forward + if task_type in []: + path = Path(model_directory) / Path("sam2_sam_prompt_encoder.pt2") + assert path.exists(), f"Expected {path} to exist" + print(f"Start load from {path}") + pkg = torch._inductor.aoti_load_package(str(path)) + pkg_m = LoadedModel(pkg) + mask_generator.predictor.model.sam_prompt_encoder.forward = pkg_m.forward + + if task_type in []: + path = Path(model_directory) / Path("sam2_sam_mask_decoder.pt2") + assert path.exists(), f"Expected {path} to exist" + print(f"Start load from {path}") + pkg = torch._inductor.aoti_load_package(str(path)) + pkg_m = LoadedModel(pkg) + mask_generator.predictor.model.sam_mask_decoder.forward = pkg_m.forward + + if task_type in []: + path = Path(model_directory) / Path("sam2_sam_mask_decoder_transformer.pt2") + assert path.exists(), f"Expected {path} to exist" + print(f"Start load from {path}") + pkg = torch._inductor.aoti_load_package(str(path)) + pkg_m = LoadedModel(pkg) + mask_generator.predictor.model.sam_mask_decoder.transformer.forward = ( + pkg_m.forward + ) + + if task_type in ["amg", "sps"]: + path = Path(model_directory) / Path("sam2_image_predict_masks.pt2") + assert path.exists(), f"Expected {path} to exist" + print(f"Start load from {path}") + pkg = torch._inductor.aoti_load_package(str(path)) + if task_type == "amg": + assert points_per_batch > 1 + if task_type == "sps": + assert points_per_batch == 1 + if task_type == "mps": + assert points_per_batch is None + pkg_m = SAM2ImagePredictor_predict_masks( + None, + batch_size=batch_size, + points_per_batch=points_per_batch, + aoti_compiled_model=pkg, + furious=furious, + ) + mask_generator.predictor._predict_masks = pkg_m.forward print(f"End load image encoder and predict masks. Took {time.time() - t0}s") @@ -352,12 +498,13 @@ def set_fast( dynamic=False, ) elif task_type == "amg": - mask_generator.predictor._predict_masks = torch.compile( - mask_generator.predictor._predict_masks, - mode="max-autotune", - fullgraph=True, - dynamic=False, - ) + if not loaded_exported_model: + mask_generator.predictor._predict_masks = torch.compile( + mask_generator.predictor._predict_masks, + mode="max-autotune", + fullgraph=True, + dynamic=False, + ) else: # TODO: This might need to be under "allow_recompiles" # mps encounters rapidly changing points per batch diff --git a/examples/sam2_amg_server/generate_data.py b/examples/sam2_amg_server/generate_data.py index 7c61a7f728..8632f0163a 100644 --- a/examples/sam2_amg_server/generate_data.py +++ b/examples/sam2_amg_server/generate_data.py @@ -21,6 +21,38 @@ from tqdm import tqdm +def profiler_runner(path, fn, *args, **kwargs): + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + record_shapes=True, + ) as prof: + result = fn(*args, **kwargs) + prof.export_chrome_trace(path) + return result + + +def memory_runner(path, fn, *args, **kwargs): + print("Start memory recording") + torch.cuda.synchronize() + torch.cuda.memory._record_memory_history( + True, trace_alloc_max_entries=100000, trace_alloc_record_context=True + ) + result = fn(*args, **kwargs) + torch.cuda.synchronize() + snapshot = torch.cuda.memory._snapshot() + print("Finish memory recording") + import pickle + + with open(path, "wb") as f: + pickle.dump(snapshot, f) + # Use to convert pickle file into html + # python torch/cuda/_memory_viz.py trace_plot .pickle -o .html + return result + + def latencies_statistics(data): # Convert the list to a NumPy array data_array = np.array(data) @@ -330,16 +362,17 @@ def decode_img_bytes(img_bytes_tensors, gpu_preproc, baseline): for img_bytes_tensor in img_bytes_tensors: with record_function("decode image bytes"): if gpu_preproc: - # NOTE: We have to use numpy for the baseline - assert not baseline - from torchvision import io as tio - - image_tensor = tio.decode_jpeg( - img_bytes_tensor, device="cuda", mode=tio.ImageReadMode.RGB - ) - from torchvision.transforms.v2 import functional as F + image_tensor = file_bytes_to_image_tensor(img_bytes_tensor) + from torchvision.transforms import ToTensor, v2 - image_tensor = F.to_dtype(image_tensor, torch.float32, scale=True) + if not baseline: + image_tensor = torch.from_numpy(image_tensor) + image_tensor = image_tensor.permute((2, 0, 1)) + image_tensor = image_tensor.cuda() + with record_function("v2.ToDtype"): + image_tensor = v2.ToDtype(torch.float32, scale=True)( + image_tensor + ) else: image_tensor = file_bytes_to_image_tensor(img_bytes_tensor) from torchvision.transforms import ToTensor @@ -431,6 +464,7 @@ def main( quiet=False, gpu_preproc=False, batch_size=1, + seed=42, ): if batch_size <= 0: raise ValueError("Expected --batch_size to be at least 1 but got {batch_size}") @@ -502,6 +536,7 @@ def main( from torchao._models.sam2.utils.amg import ( mask_to_rle_pytorch_2 as mask_to_rle_pytorch, ) + torch.manual_seed(seed) device = "cuda" sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type) if verbose: @@ -628,4 +663,5 @@ def main( main.__doc__ = main_docstring() if __name__ == "__main__": # profiler_runner("asdf.json.gz", fire.Fire, main) + # memory_runner("asdf.pickle", fire.Fire, main) fire.Fire(main) diff --git a/examples/sam2_amg_server/reproduce_experiments.py b/examples/sam2_amg_server/reproduce_experiments.py index 2684cd8111..c6799cd815 100644 --- a/examples/sam2_amg_server/reproduce_experiments.py +++ b/examples/sam2_amg_server/reproduce_experiments.py @@ -89,7 +89,7 @@ def run(task, output_path: Path, kwargs, baseline_folder=None, environ=None): stdout, stderr = run_script_with_args( [ "generate_data.py", - "~/checkpoints/sam2", + f"{str(Path.home())}/checkpoints/sam2", "large", task, image_paths, diff --git a/examples/sam2_amg_server/result.csv b/examples/sam2_amg_server/result.csv index aa43a8703e..0327159727 100644 --- a/examples/sam2_amg_server/result.csv +++ b/examples/sam2_amg_server/result.csv @@ -1,70 +1,70 @@ -p999,task,experiment_name,fourth,total_time,third,bytes_MiB,environ,allow-recompiles,p95,fail_count,torchvision_version,export-model,furious,baseline,max,bytes,fifth,argmax,meta-folder,batch-size,load-exported-model,torch_version,run_script_time,total_img_s,p99,second,total_ms_per_img,miou,num-images,fast,first,gpu-preproc,percentage,points-per-batch,median,mean,batch_size -2374ms,amg,baseline_amg,887ms,935.2057137489319s,947ms,4350,None,,1336ms,,0.22.0.dev20250109+cu124,,,None,2454ms,4561654784,717ms,222,,,,2.7.0.dev20250109+cu124,939.5637674331665,1.0692834584931363img/s,2148ms,1054ms,935.2057137489319ms,,,,1799ms,,4,64,872ms,928ms,1 -950ms,amg,amg_ao,716ms,727.5543773174286s,725ms,4010,None,,824ms,0.0,0.22.0.dev20250109+cu124,,,,1307ms,4205527040,713ms,0,,,,2.7.0.dev20250109+cu124,731.9675371646881,1.3744677115229624img/s,870ms,805ms,727.5543773174286ms,1.0,,,1307ms,,4,64,706ms,721ms,1 -1109ms,amg,amg_ao_ppb_1024_basic,574ms,643.2957496643066s,660ms,33774,None,,749ms,0.0,0.22.0.dev20250109+cu124,,,,1958ms,35415179776,575ms,109,,1,,2.7.0.dev20250109+cu124,647.9796307086945,1.5544949590011028img/s,806ms,615ms,643.2957496643066ms,0.9999994533658028,,,1108ms,,34,1024,622ms,637ms,1 -2781ms,amg,amg_ao_ppb_1024_fast_cold,410ms,877.4602742195129s,518ms,29349,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_inductor_cache_dir'},,546ms,,0.22.0.dev20250109+cu124,,,,427232ms,30775568896,394ms,0,,1,,2.7.0.dev20250109+cu124,886.4245429039001,1.1396527334408206img/s,607ms,2356ms,877.4602742195129ms,,,None,427232ms,,30,1024,423ms,870ms,1 -1392ms,amg,amg_ao_ppb_1024_fast,404ms,455.4250349998474s,440ms,29349,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_inductor_cache_dir'},,548ms,189.0,0.22.0.dev20250109+cu124,,,,8721ms,30775568896,486ms,0,,1,,2.7.0.dev20250109+cu124,460.94617104530334,2.1957510526410458img/s,607ms,1133ms,455.4250349998474ms,0.9936933217227973,,None,8721ms,,30,1024,425ms,448ms,1 -,amg,amg_ao_ppb_1024_save_export,,304.58769369125366s,,1593,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_inductor_cache_dir'},,,,0.22.0.dev20250109+cu124,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast,,,,1670930432,,,,1,,2.7.0.dev20250109+cu124,315.2948203086853,0.0img/s,,,,,0,,,,1,1024,,,1 -1061ms,amg,amg_ao_ppb_1024_load_export_cold,565ms,634.6407806873322s,631ms,32958,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_load_export_inductor_cache_dir'},,739ms,186.0,0.22.0.dev20250109+cu124,,,,1770ms,34559617024,680ms,10,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast,2.7.0.dev20250109+cu124,639.0105745792389,1.5756945195311503img/s,822ms,610ms,634.6407806873322ms,0.9945775083007625,,,1061ms,,33,1024,612ms,628ms,1 -1046ms,amg,amg_ao_ppb_1024_load_export,587ms,622.3058869838715s,603ms,32958,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_load_export_inductor_cache_dir'},,720ms,186.0,0.22.0.dev20250109+cu124,,,,1747ms,34559617024,564ms,10,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast,2.7.0.dev20250109+cu124,626.9090824127197,1.606926787799964img/s,759ms,611ms,622.3058869838715ms,0.9945775083007625,,,1045ms,,33,1024,599ms,616ms,1 -1704ms,amg,amg_ao_ppb_1024_load_export_gpu_preproc,603ms,612.9062254428864s,595ms,32982,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_load_export_inductor_cache_dir'},,699ms,772.0,0.22.0.dev20250109+cu124,,,,1730ms,34584782848,629ms,10,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast,2.7.0.dev20250109+cu124,617.6570754051208,1.631570962225746img/s,746ms,678ms,612.9062254428864ms,0.839199618648803,,,1704ms,None,33,1024,594ms,606ms,1 -1505ms,amg,amg_ao_ppb_1024_fast_export_cold,483ms,561.7602450847626s,456ms,28534,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_fast_export_inductor_cache_dir'},,567ms,186.0,0.22.0.dev20250109+cu124,,,,104358ms,29921054720,414ms,0,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast,2.7.0.dev20250109+cu124,567.9983367919922,1.7801188474081369img/s,634ms,1065ms,561.7602450847626ms,0.994521583840068,,None,104358ms,,29,1024,435ms,554ms,1 -1476ms,amg,amg_ao_ppb_1024_fast_export,389ms,446.44090843200684s,424ms,28534,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_fast_export_inductor_cache_dir'},,541ms,186.0,0.22.0.dev20250109+cu124,,,,3661ms,29921054720,380ms,0,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast,2.7.0.dev20250109+cu124,451.4739100933075,2.239938099562174img/s,635ms,742ms,446.44090843200684ms,0.994521583840068,,None,3661ms,,29,1024,421ms,439ms,1 -1432ms,amg,amg_ao_ppb_1024_fast_export_gpu_preproc,378ms,433.64031982421875s,411ms,28631,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_fast_export_inductor_cache_dir'},,513ms,772.0,0.22.0.dev20250109+cu124,,,,4632ms,30022200320,441ms,0,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast,2.7.0.dev20250109+cu124,439.1623215675354,2.306058625741633img/s,572ms,784ms,433.64031982421875ms,0.8391996832205015,,None,4632ms,None,29,1024,408ms,425ms,1 -2751ms,amg,amg_ao_ppb_1024_fast_furious_cold,163ms,841.2357618808746s,157ms,28335,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_furious_inductor_cache_dir'},,258ms,313.0,0.22.0.dev20250109+cu124,,None,,663906ms,29712144384,165ms,0,,1,,2.7.0.dev20250109+cu124,852.4052486419678,1.188727399990881img/s,307ms,2090ms,841.2357618808746ms,0.9721227795145918,,None,663906ms,,29,1024,158ms,833ms,1 -1106ms,amg,amg_ao_ppb_1024_fast_furious,167ms,182.73960876464844s,161ms,28335,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_furious_inductor_cache_dir'},,253ms,313.0,0.22.0.dev20250109+cu124,,None,,8233ms,29712144384,127ms,0,,1,,2.7.0.dev20250109+cu124,188.4141879081726,5.472267379580016img/s,312ms,1099ms,182.73960876464844ms,0.9721227795145918,,None,8233ms,,29,1024,158ms,176ms,1 -,amg,amg_ao_ppb_1024_save_export_furious,,426.2127423286438s,,954,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_furious_inductor_cache_dir'},,,,0.22.0.dev20250109+cu124,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,None,,,1000953344,,,,1,,2.7.0.dev20250109+cu124,434.3983988761902,0.0img/s,,,,,0,,,,0,1024,,,1 -1016ms,amg,amg_ao_ppb_1024_load_export_furious_cold,340ms,349.6220052242279s,332ms,27972,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_load_export_furious_inductor_cache_dir'},,427ms,203.0,0.22.0.dev20250109+cu124,,None,,2024ms,29330775040,302ms,468,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,2.7.0.dev20250109+cu124,353.6907768249512,2.860231864864044img/s,471ms,344ms,349.6220052242279ms,0.9895564557019261,,,1015ms,,28,1024,332ms,343ms,1 -1041ms,amg,amg_ao_ppb_1024_load_export_furious,301ms,360.9945259094238s,331ms,27972,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_load_export_furious_inductor_cache_dir'},,440ms,203.0,0.22.0.dev20250109+cu124,,None,,1978ms,29330775040,301ms,468,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,2.7.0.dev20250109+cu124,364.9874835014343,2.7701251077998545img/s,492ms,343ms,360.9945259094238ms,0.9895564557019261,,,1040ms,,28,1024,343ms,355ms,1 -1701ms,amg,amg_ao_ppb_1024_load_export_furious_gpu_preproc,299ms,329.88597416877747s,329ms,28039,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_load_export_furious_inductor_cache_dir'},,399ms,760.0,0.22.0.dev20250109+cu124,,None,,1966ms,29401540096,297ms,468,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,2.7.0.dev20250109+cu124,334.0973074436188,3.0313504613820785img/s,449ms,340ms,329.88597416877747ms,0.8335056624064843,,,1701ms,None,28,1024,308ms,324ms,1 -1170ms,amg,amg_ao_ppb_1024_fast_export_furious_cold,165ms,450.325879573822s,189ms,27949,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_fast_export_furious_inductor_cache_dir'},,269ms,303.0,0.22.0.dev20250109+cu124,,None,,261209ms,29307650560,164ms,0,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,2.7.0.dev20250109+cu124,456.4792420864105,2.220614104937466img/s,319ms,770ms,450.325879573822ms,0.9750078081486044,,None,261209ms,,28,1024,170ms,443ms,1 -935ms,amg,amg_ao_ppb_1024_fast_export_furious,166ms,177.67218565940857s,182ms,27949,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_fast_export_furious_inductor_cache_dir'},,253ms,303.0,0.22.0.dev20250109+cu124,,None,,3415ms,29307650560,128ms,0,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,2.7.0.dev20250109+cu124,183.61352038383484,5.628342986205873img/s,310ms,565ms,177.67218565940857ms,0.9750078081486044,,None,3415ms,,28,1024,157ms,171ms,1 -44632ms,amg,amg_ao_ppb_1024_fast_export_furious_recompiles,115ms,295.7107162475586s,132ms,13255,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_fast_export_furious_inductor_cache_dir'},None,197ms,305.0,0.22.0.dev20250109+cu124,,None,,63790ms,13898889728,168ms,0,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,2.7.0.dev20250109+cu124,301.4011402130127,3.3816833312284675img/s,237ms,454ms,295.7107162475586ms,0.9750330227313282,,None,63790ms,,13,1024,139ms,289ms,1 -885ms,amg,amg_ao_ppb_1024_fast_export_furious_gpu_preproc,125ms,156.32159233093262s,155ms,27973,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_fast_export_furious_inductor_cache_dir'},,224ms,773.0,0.22.0.dev20250109+cu124,,None,,4151ms,29332738048,120ms,0,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,2.7.0.dev20250109+cu124,162.26802515983582,6.3970689211187235img/s,275ms,396ms,156.32159233093262ms,0.8382131132391581,,None,4151ms,None,28,1024,132ms,150ms,1 -610ms,amg,amg_ao_ppb_1024_fast_export_furious_gpu_preproc_recompiles,114ms,138.77052688598633s,132ms,13227,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_fast_export_furious_inductor_cache_dir'},None,167ms,774.0,0.22.0.dev20250109+cu124,,None,,4890ms,13870295552,112ms,0,,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/amg_ao_fast_furious,2.7.0.dev20250109+cu124,144.96051049232483,7.206141119732136img/s,197ms,395ms,138.77052688598633ms,0.8381459507926375,,None,4890ms,None,13,1024,118ms,130ms,1 -306ms,sps,baseline_sps,100ms,132.67345762252808s,105ms,1337,None,,194ms,,0.22.0.dev20250109+cu124,,,None,571ms,1402492416,104ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,,,2.7.0.dev20250109+cu124,136.57290863990784,7.537302621939047img/s,276ms,222ms,132.67345762252808ms,,,,571ms,,1,1,113ms,127ms,1 -230ms,sps,sps_ao,98ms,126.97674512863159s,118ms,1339,None,,211ms,0.0,0.22.0.dev20250109+cu124,,,,545ms,1404942848,218ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,,,2.7.0.dev20250109+cu124,131.24220395088196,7.875457816996075img/s,222ms,115ms,126.97674512863158ms,1.0,,,545ms,,1,1,109ms,122ms,1 -232ms,sps,sps_ao_ppb_1_basic,100ms,136.22252011299133s,106ms,1339,None,,218ms,0.0,0.22.0.dev20250109+cu124,,,,638ms,1404942848,112ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,140.56182503700256,7.340930113248078img/s,225ms,117ms,136.22252011299133ms,1.0,,,638ms,,1,1,111ms,131ms,1 -3133ms,sps,sps_ao_ppb_1_fast_cold,91ms,524.464339017868s,97ms,1593,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_inductor_cache_dir'},,190ms,,0.22.0.dev20250109+cu124,,,,401201ms,1670930432,96ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,535.5261473655701,1.9067073308981088img/s,210ms,2734ms,524.464339017868ms,,,None,401201ms,,1,1,100ms,515ms,1 -779ms,sps,sps_ao_ppb_1_fast,212ms,132.37645173072815s,202ms,1302,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_inductor_cache_dir'},,206ms,0.0,0.22.0.dev20250109+cu124,,,,8140ms,1366200320,208ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,138.50028347969055,7.5542136605545img/s,213ms,772ms,132.37645173072815ms,0.9998687426447869,,None,8140ms,,1,1,101ms,126ms,1 -,sps,sps_ao_ppb_1_save_export,,272.5903356075287s,,1593,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_inductor_cache_dir'},,,,0.22.0.dev20250109+cu124,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast,,,,1670930432,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,283.19432258605957,0.0img/s,,,,,0,,,,1,1,,,1 -226ms,sps,sps_ao_ppb_1_load_export_cold,213ms,161.28311896324158s,211ms,5949,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_load_export_inductor_cache_dir'},,216ms,0.0,0.22.0.dev20250109+cu124,,,,707ms,6238665728,185ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast,2.7.0.dev20250109+cu124,165.69491052627563,6.2002769194208875img/s,221ms,225ms,161.28311896324158ms,0.999868677020073,,,707ms,,6,1,139ms,155ms,1 -245ms,sps,sps_ao_ppb_1_load_export,93ms,131.32559871673584s,98ms,5949,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_load_export_inductor_cache_dir'},,211ms,0.0,0.22.0.dev20250109+cu124,,,,597ms,6238665728,98ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast,2.7.0.dev20250109+cu124,136.12982988357544,7.614661648388603img/s,220ms,134ms,131.32559871673584ms,0.999868677020073,,,597ms,,6,1,104ms,125ms,1 -196ms,sps,sps_ao_ppb_1_load_export_gpu_preproc,159ms,117.73162794113159s,164ms,5971,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_load_export_inductor_cache_dir'},,162ms,0.0,0.22.0.dev20250109+cu124,,,,1361ms,6261886976,164ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast,2.7.0.dev20250109+cu124,122.47605919837952,8.493894270280727img/s,171ms,139ms,117.73162794113159ms,0.9861222158936289,,,1361ms,None,6,1,101ms,111ms,1 -228ms,sps,sps_ao_ppb_1_fast_export_cold,92ms,120.34239029884338s,96ms,5949,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_fast_export_inductor_cache_dir'},,203ms,0.0,0.22.0.dev20250109+cu124,,,,541ms,6238665728,97ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast,2.7.0.dev20250109+cu124,124.82643246650696,8.309623878308582img/s,215ms,155ms,120.34239029884338ms,0.999868677020073,,None,541ms,,6,1,101ms,114ms,1 -229ms,sps,sps_ao_ppb_1_fast_export,135ms,120.78508996963501s,96ms,5949,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_fast_export_inductor_cache_dir'},,203ms,0.0,0.22.0.dev20250109+cu124,,,,570ms,6238665728,116ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast,2.7.0.dev20250109+cu124,124.93209862709045,8.279167571522253img/s,212ms,106ms,120.78508996963501ms,0.999868677020073,,None,570ms,,6,1,102ms,115ms,1 -184ms,sps,sps_ao_ppb_1_fast_export_gpu_preproc,92ms,120.33534979820251s,94ms,5971,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_fast_export_inductor_cache_dir'},,164ms,0.0,0.22.0.dev20250109+cu124,,,,1240ms,6261886976,93ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast,2.7.0.dev20250109+cu124,124.94753289222717,8.310110052257789img/s,169ms,108ms,120.33534979820251ms,0.9861222158936289,,None,1240ms,None,6,1,97ms,114ms,1 -2368ms,sps,sps_ao_ppb_1_fast_furious_cold,19ms,581.2481288909912s,24ms,954,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_furious_inductor_cache_dir'},,70ms,0.0,0.22.0.dev20250109+cu124,,None,,532242ms,1000953344,35ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,592.1693325042725,1.7204356458023844img/s,74ms,1838ms,581.2481288909912ms,0.9996674702763557,,None,532242ms,,0,1,35ms,574ms,1 -614ms,sps,sps_ao_ppb_1_fast_furious,53ms,45.71470355987549s,25ms,861,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_furious_inductor_cache_dir'},,60ms,0.0,0.22.0.dev20250109+cu124,,None,,8026ms,903450624,23ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,51.57617497444153,21.874800056184018img/s,68ms,606ms,45.71470355987549ms,0.9996674702763557,,None,8026ms,,0,1,29ms,40ms,1 -,sps,sps_ao_ppb_1_save_export_furious,,364.1186008453369s,,954,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_furious_inductor_cache_dir'},,,,0.22.0.dev20250109+cu124,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,None,,,1000953344,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,372.80925726890564,0.0img/s,,,,,0,,,,0,1,,,1 -78ms,sps,sps_ao_ppb_1_load_export_furious_cold,50ms,53.28082203865051s,43ms,1790,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_load_export_furious_inductor_cache_dir'},,69ms,0.0,0.22.0.dev20250109+cu124,,None,,939ms,1877512192,24ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,2.7.0.dev20250109+cu124,57.669695138931274,18.76847919640933img/s,74ms,73ms,53.28082203865051ms,0.9998199329972267,,,939ms,,1,1,48ms,47ms,1 -80ms,sps,sps_ao_ppb_1_load_export_furious,21ms,50.997873306274414s,24ms,1790,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_load_export_furious_inductor_cache_dir'},,70ms,0.0,0.22.0.dev20250109+cu124,,None,,861ms,1877512192,24ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,2.7.0.dev20250109+cu124,55.45322823524475,19.60866081599852img/s,74ms,33ms,50.997873306274414ms,0.9998199329972267,,,861ms,,1,1,42ms,45ms,1 -29ms,sps,sps_ao_ppb_1_load_export_furious_gpu_preproc,17ms,24.790576696395874s,18ms,1814,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_load_export_furious_inductor_cache_dir'},,19ms,0.0,0.22.0.dev20250109+cu124,,None,,1612ms,1902484480,18ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,2.7.0.dev20250109+cu124,29.53805947303772,40.33790791746216img/s,19ms,27ms,24.790576696395874ms,0.9860970453268383,,,1612ms,None,1,1,17ms,19ms,1 -82ms,sps,sps_ao_ppb_1_fast_export_furious_cold,20ms,39.87857627868652s,36ms,1790,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_fast_export_furious_inductor_cache_dir'},,61ms,0.0,0.22.0.dev20250109+cu124,,None,,866ms,1877512192,25ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,2.7.0.dev20250109+cu124,44.19964957237244,25.076120897888206img/s,71ms,35ms,39.87857627868652ms,0.9998199329972267,,None,866ms,,1,1,31ms,34ms,1 -75ms,sps,sps_ao_ppb_1_fast_export_furious,20ms,40.75656461715698s,24ms,1790,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_fast_export_furious_inductor_cache_dir'},,64ms,0.0,0.22.0.dev20250109+cu124,,None,,865ms,1877512192,26ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,2.7.0.dev20250109+cu124,45.36444664001465,24.53592468829028img/s,70ms,34ms,40.75656461715698ms,0.9998199329972267,,None,865ms,,1,1,31ms,35ms,1 -93ms,sps,sps_ao_ppb_1_fast_export_furious_recompiles,21ms,49.636521339416504s,25ms,1790,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_fast_export_furious_inductor_cache_dir'},None,66ms,0.0,0.22.0.dev20250109+cu124,,None,,9723ms,1877512192,25ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,2.7.0.dev20250109+cu124,55.89960026741028,20.146456137849796img/s,73ms,37ms,49.636521339416504ms,0.24249802377738716,,None,9723ms,,1,1,31ms,44ms,1 -29ms,sps,sps_ao_ppb_1_fast_export_furious_gpu_preproc,17ms,24.562424421310425s,19ms,1814,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_fast_export_furious_inductor_cache_dir'},,19ms,0.0,0.22.0.dev20250109+cu124,,None,,1566ms,1902484480,18ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,2.7.0.dev20250109+cu124,29.499178171157837,40.71259346583057img/s,19ms,27ms,24.562424421310425ms,0.9860970453268383,,None,1566ms,None,1,1,17ms,19ms,1 -32ms,sps,sps_ao_ppb_1_fast_export_furious_gpu_preproc_recompiles,17ms,26.11998414993286s,19ms,1814,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/sps_fast_export_furious_inductor_cache_dir'},None,19ms,0.0,0.22.0.dev20250109+cu124,,None,,3477ms,1902484480,18ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/sps_ao_fast_furious,2.7.0.dev20250109+cu124,32.0809326171875,38.284862435591116img/s,20ms,29ms,26.11998414993286ms,0.18694353939804045,,None,3477ms,None,1,1,17ms,21ms,1 -1614ms,mps,baseline_mps,217ms,339.7126615047455s,368ms,1337,None,,738ms,,0.22.0.dev20250109+cu124,,,None,1837ms,1402492416,510ms,126,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,,,2.7.0.dev20250109+cu124,344.3770024776459,2.943664200122935img/s,1304ms,490ms,339.7126615047455ms,,,,579ms,,1,,263ms,332ms,1 -385ms,mps,mps_ao,104ms,139.90302205085754s,118ms,8022,None,,215ms,0.0,0.22.0.dev20250109+cu124,,,,600ms,8411699712,150ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,,,2.7.0.dev20250109+cu124,144.1774024963379,7.147808427158064img/s,237ms,132ms,139.90302205085754ms,0.999999164044857,,,600ms,,8,,121ms,133ms,1 -295ms,mps,mps_ao_ppb_None_basic,216ms,180.09048891067505s,231ms,8022,None,,236ms,0.0,0.22.0.dev20250109+cu124,,,,622ms,8411699712,246ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,184.8732569217682,5.55276409125637img/s,263ms,236ms,180.09048891067505ms,0.999999164044857,,,622ms,,8,,162ms,171ms,1 -43126ms,mps,mps_ao_ppb_None_fast_cold,93ms,531.2832531929016s,104ms,8021,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_inductor_cache_dir'},,208ms,,0.22.0.dev20250109+cu124,,,,331945ms,8411176448,110ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,543.5350062847137,1.8822351240890964img/s,224ms,1009ms,531.2832531929016ms,,,None,331945ms,,8,,107ms,524ms,1 -1451ms,mps,mps_ao_ppb_None_fast,95ms,177.8515875339508s,109ms,8021,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_inductor_cache_dir'},,226ms,0.0,0.22.0.dev20250109+cu124,,,,8897ms,8411176448,147ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,183.4075665473938,5.622665582386809img/s,248ms,581ms,177.8515875339508ms,0.9983835342526436,,None,8897ms,,8,,146ms,170ms,1 -,mps,mps_ao_ppb_None_save_export,,262.2255263328552s,,1593,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_inductor_cache_dir'},,,,0.22.0.dev20250109+cu124,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast,,,,1670930432,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,270.12541913986206,0.0img/s,,,,,0,,,,1,,,,1 -333ms,mps,mps_ao_ppb_None_load_export_cold,97ms,138.29926824569702s,111ms,7206,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_load_export_inductor_cache_dir'},,220ms,0.0,0.22.0.dev20250109+cu124,,,,649ms,7556661248,120ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast,2.7.0.dev20250109+cu124,142.37936091423035,7.230696247961626img/s,234ms,125ms,138.29926824569702ms,0.9983786268234253,,,649ms,,7,,114ms,131ms,1 -320ms,mps,mps_ao_ppb_None_load_export,96ms,132.98988270759583s,109ms,7206,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_load_export_inductor_cache_dir'},,212ms,0.0,0.22.0.dev20250109+cu124,,,,543ms,7556661248,118ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast,2.7.0.dev20250109+cu124,137.46344566345215,7.519368989885455img/s,235ms,185ms,132.98988270759583ms,0.9983786268234253,,,543ms,,7,,112ms,125ms,1 -369ms,mps,mps_ao_ppb_None_load_export_gpu_preproc,95ms,153.9310953617096s,179ms,7230,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_load_export_inductor_cache_dir'},,184ms,0.0,0.22.0.dev20250109+cu124,,,,1217ms,7581827072,127ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast,2.7.0.dev20250109+cu124,159.28356790542603,6.496413201310528img/s,202ms,139ms,153.9310953617096ms,0.9224205894982442,,,1217ms,None,7,,153ms,145ms,1 -37104ms,mps,mps_ao_ppb_None_fast_export_cold,96ms,236.0241584777832s,107ms,7206,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_fast_export_inductor_cache_dir'},,206ms,0.0,0.22.0.dev20250109+cu124,,,,39205ms,7556661248,113ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast,2.7.0.dev20250109+cu124,244.1103572845459,4.23685442392597img/s,229ms,119ms,236.0241584777832ms,0.9983784531950951,,None,39205ms,,7,,109ms,227ms,1 -1280ms,mps,mps_ao_ppb_None_fast_export,103ms,132.519935131073s,176ms,7206,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_fast_export_inductor_cache_dir'},,203ms,0.0,0.22.0.dev20250109+cu124,,,,3634ms,7556661248,155ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast,2.7.0.dev20250109+cu124,137.68328261375427,7.54603448161153img/s,223ms,223ms,132.519935131073ms,0.9983784534335136,,None,3634ms,,7,,109ms,125ms,1 -1267ms,mps,mps_ao_ppb_None_fast_export_gpu_preproc,157ms,147.0070924758911s,181ms,7230,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_fast_export_inductor_cache_dir'},,175ms,0.0,0.22.0.dev20250109+cu124,,,,3928ms,7581827072,118ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast,2.7.0.dev20250109+cu124,152.5612542629242,6.80239288566297img/s,195ms,185ms,147.0070924758911ms,0.9224205495780334,,None,3928ms,None,7,,131ms,139ms,1 -44108ms,mps,mps_ao_ppb_None_fast_furious_cold,22ms,604.3798043727875s,30ms,4222,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_furious_inductor_cache_dir'},,69ms,0.0,0.22.0.dev20250109+cu124,,None,,488223ms,4427842560,69ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,616.8908636569977,1.654588708565103img/s,80ms,1530ms,604.3798043727875ms,0.9972913320064545,,None,488223ms,,4,,33ms,597ms,1 -1341ms,mps,mps_ao_ppb_None_fast_furious,59ms,78.28538370132446s,66ms,4222,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_furious_inductor_cache_dir'},,79ms,0.0,0.22.0.dev20250109+cu124,,None,,9623ms,4427842560,73ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,84.57566738128662,12.773776568755345img/s,89ms,551ms,78.28538370132446ms,0.9972910861372948,,None,9623ms,,4,,61ms,70ms,1 -,mps,mps_ao_ppb_None_save_export_furious,,349.34193754196167s,,954,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_furious_inductor_cache_dir'},,,,0.22.0.dev20250109+cu124,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,None,,,1000953344,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,,2.7.0.dev20250109+cu124,360.5604326725006,0.0img/s,,,,,0,,,,0,,,,1 -309ms,mps,mps_ao_ppb_None_load_export_furious_cold,34ms,56.33559775352478s,41ms,3813,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_load_export_furious_inductor_cache_dir'},,80ms,0.0,0.22.0.dev20250109+cu124,,None,,765ms,3998387200,43ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,2.7.0.dev20250109+cu124,60.93665313720703,17.75076576581514img/s,88ms,54ms,56.33559775352478ms,0.9961582001447677,,,765ms,,3,,44ms,49ms,1 -353ms,mps,mps_ao_ppb_None_load_export_furious,33ms,56.61087965965271s,40ms,3813,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_load_export_furious_inductor_cache_dir'},,80ms,0.0,0.22.0.dev20250109+cu124,,None,,845ms,3998387200,40ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,2.7.0.dev20250109+cu124,61.454379081726074,17.664449060181493img/s,88ms,85ms,56.61087965965271ms,0.9961582001447677,,,845ms,,3,,44ms,49ms,1 -322ms,mps,mps_ao_ppb_None_load_export_furious_gpu_preproc,29ms,40.086507081985474s,33ms,3837,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_load_export_furious_inductor_cache_dir'},,39ms,0.0,0.22.0.dev20250109+cu124,,None,,1539ms,4023553024,33ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,2.7.0.dev20250109+cu124,44.91008281707764,24.94604975072501img/s,49ms,49ms,40.086507081985474ms,0.9239367794789141,,,1539ms,None,3,,30ms,33ms,1 -32689ms,mps,mps_ao_ppb_None_fast_export_furious_cold,60ms,157.29275488853455s,67ms,3813,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_fast_export_furious_inductor_cache_dir'},,74ms,0.0,0.22.0.dev20250109+cu124,,None,,45808ms,3998387200,55ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,2.7.0.dev20250109+cu124,165.38462448120117,6.35757190919982img/s,89ms,78ms,157.29275488853455ms,0.9969035378098487,,None,45808ms,,3,,38ms,147ms,1 -1401ms,mps,mps_ao_ppb_None_fast_export_furious,60ms,50.659629821777344s,68ms,3813,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_fast_export_furious_inductor_cache_dir'},,70ms,0.0,0.22.0.dev20250109+cu124,,None,,3938ms,3998387200,70ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,2.7.0.dev20250109+cu124,56.82898807525635,19.73958363924176img/s,80ms,77ms,50.659629821777344ms,0.9969037767052651,,None,3938ms,,3,,33ms,43ms,1 -8305ms,mps,mps_ao_ppb_None_fast_export_furious_recompiles,21ms,65.21127843856812s,28ms,3813,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_fast_export_furious_inductor_cache_dir'},None,63ms,0.0,0.22.0.dev20250109+cu124,,None,,13909ms,3998387200,54ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,2.7.0.dev20250109+cu124,71.5342059135437,15.334770670721383img/s,77ms,38ms,65.21127843856812ms,0.9963943874835968,,None,13909ms,,3,,33ms,58ms,1 -1311ms,mps,mps_ao_ppb_None_fast_export_furious_gpu_preproc,19ms,33.9236855506897s,24ms,3837,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_fast_export_furious_inductor_cache_dir'},,30ms,0.0,0.22.0.dev20250109+cu124,,None,,4556ms,4023553024,26ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,2.7.0.dev20250109+cu124,40.050333738327026,29.47792917446345img/s,38ms,31ms,33.9236855506897ms,0.9237591220784234,,None,4556ms,None,3,,20ms,27ms,1 -1649ms,mps,mps_ao_ppb_None_fast_export_furious_gpu_preproc_recompiles,18ms,34.80714464187622s,23ms,3837,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/mps_fast_export_furious_inductor_cache_dir'},None,28ms,0.0,0.22.0.dev20250109+cu124,,None,,5661ms,4023553024,25ms,0,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/amg_baseline_annotations,1,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_11/exported_models/mps_ao_fast_furious,2.7.0.dev20250109+cu124,41.254807472229004,28.729733802895954img/s,34ms,31ms,34.80714464187622ms,0.9227598560500192,,None,5661ms,None,3,,20ms,28ms,1 +furious,fast,points-per-batch,bytes,argmax,p95,p999,p99,miou,fourth,total_time,torch_version,total_img_s,batch-size,second,experiment_name,run_script_time,mean,batch_size,percentage,third,task,num-images,fifth,environ,fail_count,allow-recompiles,max,load-exported-model,torchvision_version,median,total_ms_per_img,gpu-preproc,meta-folder,bytes_MiB,first,baseline,export-model +,,64,4561654784,468,1323ms,2363ms,2086ms,,892ms,927.4758312702179s,2.7.0.dev20250201+cu124,1.0781952114379705img/s,,1046ms,baseline_amg,931.3759133815765,921ms,1,4,955ms,amg,,724ms,None,,,2466ms,,0.22.0.dev20250201+cu124,869ms,927.4758312702179ms,,,4350,1733ms,None, +,,64,4205527040,0,815ms,904ms,857ms,1.0,660ms,718.6690595149994s,2.7.0.dev20250201+cu124,1.3914610442181266img/s,,748ms,amg_ao,723.3117945194244,713ms,1,4,673ms,amg,,760ms,None,0.0,,1263ms,,0.22.0.dev20250201+cu124,697ms,718.6690595149994ms,,,4010,1263ms,, +,,1024,35427762688,109,745ms,1006ms,791ms,0.9999994533658028,577ms,631.6344785690308s,2.7.0.dev20250201+cu124,1.5831941319376708img/s,1,619ms,amg_ao_ppb_1024_basic,635.8103907108307,626ms,1,34,594ms,amg,,609ms,None,0.0,,1947ms,,0.22.0.dev20250201+cu124,610ms,631.6344785690308ms,,,33786,1005ms,, +,None,1024,30775568896,0,576ms,3526ms,644ms,,501ms,849.2408077716827s,2.7.0.dev20250201+cu124,1.1775223126923131img/s,1,3157ms,amg_ao_ppb_1024_fast_cold,861.5647690296173,841ms,1,30,421ms,amg,,501ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_inductor_cache_dir'},,,372124ms,,0.22.0.dev20250201+cu124,466ms,849.2408077716827ms,,,29349,372124ms,, +,None,1024,30775568896,0,541ms,1512ms,617ms,0.9937346105006776,386ms,452.082448720932s,2.7.0.dev20250201+cu124,2.2119858951155487img/s,1,1000ms,amg_ao_ppb_1024_fast,458.1768579483032,446ms,1,30,448ms,amg,,392ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_inductor_cache_dir'},191.0,,8411ms,,0.22.0.dev20250201+cu124,422ms,452.082448720932ms,,,29349,8411ms,, +,,1024,18221665280,,,,,,,356.0369083881378s,2.7.0.dev20250201+cu124,0.0img/s,1,,amg_ao_ppb_1024_save_export,367.34787678718567,,1,17,,amg,0,,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_inductor_cache_dir'},,,,,0.22.0.dev20250201+cu124,,,,,17377,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast +,,1024,49836364288,837,559ms,1592ms,639ms,0.993709121615135,397ms,460.2203013896942s,2.7.0.dev20250201+cu124,2.1728724199701137img/s,1,493ms,amg_ao_ppb_1024_load_export_cold,464.4886541366577,453ms,1,48,443ms,amg,,510ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_load_export_inductor_cache_dir'},188.0,,1760ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast,0.22.0.dev20250201+cu124,436ms,460.2203013896942ms,,,47527,961ms,, +,,1024,49836364288,837,592ms,1691ms,649ms,0.993709121615135,445ms,478.4169816970825s,2.7.0.dev20250201+cu124,2.09022680685939img/s,1,431ms,amg_ao_ppb_1024_load_export,483.0541400909424,472ms,1,48,429ms,amg,,508ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_load_export_inductor_cache_dir'},188.0,,1737ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast,0.22.0.dev20250201+cu124,462ms,478.4169816970825ms,,,47527,763ms,, +,,1024,49861530112,837,565ms,1670ms,622ms,0.9937652501226203,398ms,465.69065976142883s,2.7.0.dev20250201+cu124,2.1473482000096276img/s,1,435ms,amg_ao_ppb_1024_load_export_gpu_preproc,469.74300265312195,460ms,1,48,427ms,amg,,397ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_load_export_inductor_cache_dir'},185.0,,1735ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast,0.22.0.dev20250201+cu124,452ms,465.69065976142883ms,None,,47551,776ms,, +,None,1024,49836364288,837,546ms,1611ms,608ms,0.993709121615135,415ms,454.15750002861023s,2.7.0.dev20250201+cu124,2.201879303847242img/s,1,438ms,amg_ao_ppb_1024_fast_export_cold,458.17887783050537,448ms,1,48,545ms,amg,,421ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_fast_export_inductor_cache_dir'},188.0,,1730ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast,0.22.0.dev20250201+cu124,430ms,454.15750002861023ms,,,47527,943ms,, +,None,1024,49836364288,837,577ms,1702ms,643ms,0.993709121615135,402ms,473.2662968635559s,2.7.0.dev20250201+cu124,2.112975309307316img/s,1,432ms,amg_ao_ppb_1024_fast_export,477.25709891319275,467ms,1,48,427ms,amg,,486ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_fast_export_inductor_cache_dir'},188.0,,1742ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast,0.22.0.dev20250201+cu124,451ms,473.2662968635559ms,,,47527,754ms,, +,None,1024,49861530112,837,543ms,1597ms,596ms,0.9937652501226203,396ms,450.6334979534149s,2.7.0.dev20250201+cu124,2.219098235132482img/s,1,433ms,amg_ao_ppb_1024_fast_export_gpu_preproc,454.61152243614197,445ms,1,48,426ms,amg,,395ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_fast_export_inductor_cache_dir'},185.0,,1766ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast,0.22.0.dev20250201+cu124,430ms,450.6334979534149ms,None,,47551,764ms,, +None,None,1024,29712131072,0,275ms,2880ms,333ms,0.9736336072679046,169ms,994.9303135871887s,2.7.0.dev20250201+cu124,1.0050955190967423img/s,1,2081ms,amg_ao_ppb_1024_fast_furious_cold,1006.4958641529083,987ms,1,29,192ms,amg,,143ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_furious_inductor_cache_dir'},305.0,,800771ms,,0.22.0.dev20250201+cu124,174ms,994.9303135871887ms,,,28335,800771ms,, +None,None,1024,29712131072,0,274ms,933ms,334ms,0.9736336072679046,163ms,192.62348794937134s,2.7.0.dev20250201+cu124,5.191474885258216img/s,1,699ms,amg_ao_ppb_1024_fast_furious,198.63731622695923,186ms,1,29,179ms,amg,,130ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_furious_inductor_cache_dir'},305.0,,10094ms,,0.22.0.dev20250201+cu124,165ms,192.62348794937134ms,,,28335,10094ms,, +None,,1024,9179703808,,,,,,,519.6249597072601s,2.7.0.dev20250201+cu124,0.0img/s,1,,amg_ao_ppb_1024_save_export_furious,529.3503592014313,,1,8,,amg,0,,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_furious_inductor_cache_dir'},,,,,0.22.0.dev20250201+cu124,,,,,8754,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious +None,,1024,29307644416,468,259ms,906ms,309ms,0.971583874842335,166ms,178.88770842552185s,2.7.0.dev20250201+cu124,5.590099000101732img/s,1,202ms,amg_ao_ppb_1024_load_export_furious_cold,183.20707321166992,169ms,1,28,198ms,amg,,169ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_load_export_furious_inductor_cache_dir'},308.0,,1468ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious,0.22.0.dev20250201+cu124,158ms,178.88770842552185ms,,,27949,906ms,, +None,,1024,29307644416,468,258ms,716ms,299ms,0.971583874842335,167ms,173.60630631446838s,2.7.0.dev20250201+cu124,5.760159416033033img/s,1,164ms,amg_ao_ppb_1024_load_export_furious,177.37090826034546,168ms,1,28,156ms,amg,,125ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_load_export_furious_inductor_cache_dir'},308.0,,1468ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious,0.22.0.dev20250201+cu124,157ms,173.60630631446838ms,,,27949,716ms,, +None,,1024,29308632576,468,232ms,679ms,282ms,0.9707489542138409,126ms,156.5510959625244s,2.7.0.dev20250201+cu124,6.387690829321198img/s,1,160ms,amg_ao_ppb_1024_load_export_furious_gpu_preproc,160.46401953697205,151ms,1,28,155ms,amg,,126ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_load_export_furious_inductor_cache_dir'},290.0,,1467ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious,0.22.0.dev20250201+cu124,136ms,156.5510959625244ms,None,,27950,678ms,, +None,None,1024,29307644416,468,268ms,750ms,320ms,0.971583874842335,159ms,182.61804270744324s,2.7.0.dev20250201+cu124,5.4759101848551435img/s,1,162ms,amg_ao_ppb_1024_fast_export_furious_cold,187.25734424591064,177ms,1,28,158ms,amg,,149ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_fast_export_furious_inductor_cache_dir'},308.0,,1466ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious,0.22.0.dev20250201+cu124,165ms,182.61804270744324ms,,,27949,750ms,, +None,None,1024,29307644416,468,259ms,700ms,308ms,0.971583874842335,134ms,178.3385353088379s,2.7.0.dev20250201+cu124,5.607313070437913img/s,1,160ms,amg_ao_ppb_1024_fast_export_furious,182.3735547065735,173ms,1,28,157ms,amg,,162ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_fast_export_furious_inductor_cache_dir'},308.0,,1507ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious,0.22.0.dev20250201+cu124,163ms,178.3385353088379ms,,,27949,700ms,, +None,None,1024,16525926912,0,201ms,36421ms,227ms,0.9716291864482343,141ms,245.76354837417603s,2.7.0.dev20250201+cu124,4.068951667630937img/s,1,137ms,amg_ao_ppb_1024_fast_export_furious_recompiles,251.90375113487244,240ms,1,16,131ms,amg,,128ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_fast_export_furious_inductor_cache_dir'},311.0,None,49208ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious,0.22.0.dev20250201+cu124,140ms,245.76354837417603ms,,,15760,49208ms,, +None,None,1024,29308632576,468,233ms,774ms,283ms,0.9707489542138409,127ms,157.9279761314392s,2.7.0.dev20250201+cu124,6.3320003491194425img/s,1,163ms,amg_ao_ppb_1024_fast_export_furious_gpu_preproc,162.7095422744751,152ms,1,28,157ms,amg,,129ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_fast_export_furious_inductor_cache_dir'},290.0,,1464ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious,0.22.0.dev20250201+cu124,137ms,157.9279761314392ms,None,,27950,773ms,, +None,None,1024,16551092736,0,174ms,308ms,203ms,0.9708677416053486,115ms,137.26364755630493s,2.7.0.dev20250201+cu124,7.28525008480344img/s,1,135ms,amg_ao_ppb_1024_fast_export_furious_gpu_preproc_recompiles,142.44125938415527,130ms,1,16,135ms,amg,,116ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_fast_export_furious_inductor_cache_dir'},293.0,None,2189ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/amg_ao_fast_furious,0.22.0.dev20250201+cu124,121ms,137.26364755630493ms,None,,15784,2189ms,, +,,1,1402492416,0,214ms,316ms,281ms,,100ms,136.17227387428284s,2.7.0.dev20250201+cu124,7.343638844741783img/s,,118ms,baseline_sps,140.2417643070221,131ms,1,1,105ms,sps,,227ms,None,,,532ms,,0.22.0.dev20250201+cu124,115ms,136.17227387428284ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1337,532ms,None, +,,1,1404942848,0,205ms,229ms,219ms,1.0,105ms,127.24607348442078s,2.7.0.dev20250201+cu124,7.858788665274091img/s,,105ms,sps_ao,131.5206482410431,122ms,1,1,102ms,sps,,225ms,None,0.0,,579ms,,0.22.0.dev20250201+cu124,110ms,127.24607348442076ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1339,579ms,, +,,1,1404989952,0,203ms,256ms,218ms,1.0,106ms,124.8940806388855s,2.7.0.dev20250201+cu124,8.006784588065194img/s,1,104ms,sps_ao_ppb_1_basic,128.7957148551941,120ms,1,1,102ms,sps,,217ms,None,0.0,,583ms,,0.22.0.dev20250201+cu124,109ms,124.8940806388855ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1339,583ms,, +,None,1,1408784896,0,216ms,3260ms,223ms,,201ms,488.7042841911316s,2.7.0.dev20250201+cu124,2.046227201906217img/s,1,2959ms,sps_ao_ppb_1_fast_cold,496.82423877716064,483ms,1,1,212ms,sps,,209ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_inductor_cache_dir'},,,304090ms,,0.22.0.dev20250201+cu124,203ms,488.7042841911316ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1343,304090ms,, +,None,1,1366200320,0,217ms,775ms,222ms,0.9998691322207451,122ms,196.3028929233551s,2.7.0.dev20250201+cu124,5.0941684307752img/s,1,768ms,sps_ao_ppb_1_fast,202.54180693626404,189ms,1,1,195ms,sps,,208ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_inductor_cache_dir'},0.0,,8209ms,,0.22.0.dev20250201+cu124,205ms,196.3028929233551ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1302,8209ms,, +,,1,1390578176,,,,,,,307.4514627456665s,2.7.0.dev20250201+cu124,0.0img/s,1,,sps_ao_ppb_1_save_export,316.7780604362488,,1,1,,sps,0,,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_inductor_cache_dir'},,,,,0.22.0.dev20250201+cu124,,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1326,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast +,,1,6238665728,0,215ms,233ms,221ms,0.9998687437176704,202ms,160.5826907157898s,2.7.0.dev20250201+cu124,6.227321235822784img/s,1,221ms,sps_ao_ppb_1_load_export_cold,165.16510462760925,153ms,1,6,198ms,sps,,214ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_load_export_inductor_cache_dir'},0.0,,576ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast,0.22.0.dev20250201+cu124,138ms,160.5826907157898ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,5949,576ms,, +,,1,6238665728,0,213ms,294ms,220ms,0.9998687437176704,210ms,130.84592247009277s,2.7.0.dev20250201+cu124,7.642576712534304img/s,1,108ms,sps_ao_ppb_1_load_export,135.52789616584778,125ms,1,6,144ms,sps,,140ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_load_export_inductor_cache_dir'},0.0,,434ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast,0.22.0.dev20250201+cu124,104ms,130.84592247009277ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,5949,434ms,, +,,1,6261886976,0,165ms,180ms,175ms,0.999868236720562,100ms,118.1360731124878s,2.7.0.dev20250201+cu124,8.46481496847971img/s,1,103ms,sps_ao_ppb_1_load_export_gpu_preproc,122.45444965362549,112ms,1,6,103ms,sps,,98ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_load_export_inductor_cache_dir'},0.0,,488ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast,0.22.0.dev20250201+cu124,103ms,118.1360731124878ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,5971,488ms,, +,None,1,6238665728,0,206ms,226ms,216ms,0.9998687437176704,92ms,124.29203748703003s,2.7.0.dev20250201+cu124,8.045567682518286img/s,1,121ms,sps_ao_ppb_1_fast_export_cold,128.70573449134827,118ms,1,6,135ms,sps,,96ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_fast_export_inductor_cache_dir'},0.0,,430ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast,0.22.0.dev20250201+cu124,104ms,124.29203748703003ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,5949,430ms,, +,None,1,6238665728,0,200ms,226ms,216ms,0.9998687437176704,99ms,121.70427465438843s,2.7.0.dev20250201+cu124,8.216638263855277img/s,1,99ms,sps_ao_ppb_1_fast_export,126.40637016296387,115ms,1,6,96ms,sps,,105ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_fast_export_inductor_cache_dir'},0.0,,474ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast,0.22.0.dev20250201+cu124,103ms,121.70427465438843ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,5949,474ms,, +,None,1,6261886976,0,168ms,189ms,178ms,0.999868236720562,93ms,122.82635688781738s,2.7.0.dev20250201+cu124,8.141575027852884img/s,1,107ms,sps_ao_ppb_1_fast_export_gpu_preproc,127.55544590950012,117ms,1,6,98ms,sps,,172ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_fast_export_inductor_cache_dir'},0.0,,481ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast,0.22.0.dev20250201+cu124,104ms,122.82635688781738ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,5971,481ms,, +None,None,1,903450624,0,66ms,2448ms,71ms,0.9996802344322204,18ms,598.2366213798523s,2.7.0.dev20250201+cu124,1.6715793788977134img/s,1,1896ms,sps_ao_ppb_1_fast_furious_cold,606.6854190826416,590ms,1,0,24ms,sps,,30ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_furious_inductor_cache_dir'},0.0,,553957ms,,0.22.0.dev20250201+cu124,30ms,598.2366213798523ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,861,553957ms,, +None,None,1,903450624,0,60ms,922ms,68ms,0.9996802344322204,19ms,46.42959976196289s,2.7.0.dev20250201+cu124,21.537984499690705img/s,1,914ms,sps_ao_ppb_1_fast_furious,52.85066604614258,40ms,1,0,27ms,sps,,52ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_furious_inductor_cache_dir'},0.0,,8831ms,,0.22.0.dev20250201+cu124,28ms,46.42959976196289ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,861,8831ms,, +None,,1,903450624,,,,,,,395.61680269241333s,2.7.0.dev20250201+cu124,0.0img/s,1,,sps_ao_ppb_1_save_export_furious,405.58058881759644,,1,0,,sps,0,,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_furious_inductor_cache_dir'},,,,,0.22.0.dev20250201+cu124,,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,861,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious +None,,1,1768025088,0,63ms,78ms,70ms,0.9996752961277962,31ms,40.04996109008789s,2.7.0.dev20250201+cu124,24.968813271768536img/s,1,41ms,sps_ao_ppb_1_load_export_furious_cold,44.494996547698975,33ms,1,1,54ms,sps,,58ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_load_export_furious_inductor_cache_dir'},0.0,,688ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious,0.22.0.dev20250201+cu124,29ms,40.04996109008789ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1686,688ms,, +None,,1,1768025088,0,67ms,98ms,73ms,0.9996752961277962,54ms,41.31868815422058s,2.7.0.dev20250201+cu124,24.20212365570597img/s,1,24ms,sps_ao_ppb_1_load_export_furious,45.522459983825684,36ms,1,1,24ms,sps,,24ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_load_export_furious_inductor_cache_dir'},0.0,,769ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious,0.22.0.dev20250201+cu124,31ms,41.31868815422058ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1686,769ms,, +None,,1,1794153472,0,28ms,33ms,30ms,0.9996936089992523,18ms,30.337790489196777s,2.7.0.dev20250201+cu124,32.96218952913192img/s,1,21ms,sps_ao_ppb_1_load_export_furious_gpu_preproc,35.1632604598999,22ms,1,1,22ms,sps,,22ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_load_export_furious_inductor_cache_dir'},0.0,,720ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious,0.22.0.dev20250201+cu124,20ms,30.337790489196777ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1711,720ms,, +None,None,1,1768025088,0,59ms,82ms,69ms,0.9996752961277962,37ms,36.78891086578369s,2.7.0.dev20250201+cu124,27.182103967368906img/s,1,39ms,sps_ao_ppb_1_fast_export_furious_cold,40.70477890968323,31ms,1,1,53ms,sps,,35ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_fast_export_furious_inductor_cache_dir'},0.0,,752ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious,0.22.0.dev20250201+cu124,28ms,36.78891086578369ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1686,752ms,, +None,None,1,1768025088,0,62ms,74ms,69ms,0.9996752961277962,45ms,37.20629072189331s,2.7.0.dev20250201+cu124,26.877175353886315img/s,1,39ms,sps_ao_ppb_1_fast_export_furious,41.312560081481934,32ms,1,1,22ms,sps,,23ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_fast_export_furious_inductor_cache_dir'},0.0,,678ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious,0.22.0.dev20250201+cu124,29ms,37.20629072189331ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1686,678ms,, +None,None,1,1768025088,0,58ms,82ms,68ms,0.24502152660781712,19ms,44.12568783760071s,2.7.0.dev20250201+cu124,22.662536246015694img/s,1,62ms,sps_ao_ppb_1_fast_export_furious_recompiles,49.61470317840576,38ms,1,1,22ms,sps,,23ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_fast_export_furious_inductor_cache_dir'},0.0,None,8124ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious,0.22.0.dev20250201+cu124,28ms,44.12568783760071ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1686,8124ms,, +None,None,1,1794153472,0,26ms,29ms,27ms,0.9996936089992523,16ms,25.35749101638794s,2.7.0.dev20250201+cu124,39.436078252131644img/s,1,20ms,sps_ao_ppb_1_fast_export_furious_gpu_preproc,29.401476621627808,20ms,1,1,20ms,sps,,21ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_fast_export_furious_inductor_cache_dir'},0.0,,662ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious,0.22.0.dev20250201+cu124,19ms,25.35749101638794ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1711,662ms,, +None,None,1,1794153472,0,26ms,31ms,27ms,0.22546337781244644,17ms,26.919757604599s,2.7.0.dev20250201+cu124,37.14743701218019img/s,1,21ms,sps_ao_ppb_1_fast_export_furious_gpu_preproc_recompiles,32.35977077484131,22ms,1,1,20ms,sps,,21ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/sps_fast_export_furious_inductor_cache_dir'},0.0,None,2134ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/sps_ao_fast_furious,0.22.0.dev20250201+cu124,19ms,26.919757604599ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1711,2134ms,, +,,,1402492416,126,775ms,1593ms,1171ms,,150ms,331.5782699584961s,2.7.0.dev20250201+cu124,3.0158791772608344img/s,,289ms,baseline_mps,335.87450075149536,324ms,1,1,304ms,mps,,541ms,None,,,1991ms,,0.22.0.dev20250201+cu124,258ms,331.5782699584961ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1337,611ms,None, +,,,8411175424,0,227ms,311ms,239ms,0.999999164044857,105ms,143.97097539901733s,2.7.0.dev20250201+cu124,6.945844446969173img/s,,127ms,mps_ao,148.60355854034424,137ms,1,8,117ms,mps,,127ms,None,0.0,,634ms,,0.22.0.dev20250201+cu124,122ms,143.97097539901733ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,8021,634ms,, +,,,8411175424,0,234ms,309ms,259ms,0.999999164044857,221ms,164.95788407325745s,2.7.0.dev20250201+cu124,6.062153413388245img/s,1,234ms,mps_ao_ppb_None_basic,168.8498158454895,158ms,1,8,231ms,mps,,242ms,None,0.0,,644ms,,0.22.0.dev20250201+cu124,135ms,164.95788407325745ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,8021,644ms,, +,None,,8411176448,0,220ms,54779ms,243ms,,209ms,568.1692686080933s,2.7.0.dev20250201+cu124,1.7600388744181994img/s,1,1564ms,mps_ao_ppb_None_fast_cold,577.6140518188477,561ms,1,8,130ms,mps,,214ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_inductor_cache_dir'},,,332350ms,,0.22.0.dev20250201+cu124,115ms,568.1692686080933ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,8021,332350ms,, +,None,,8411176448,0,221ms,1345ms,240ms,0.9983834705352783,97ms,165.37928342819214s,2.7.0.dev20250201+cu124,6.0467065721336315img/s,1,580ms,mps_ao_ppb_None_fast,170.9393391609192,155ms,1,8,109ms,mps,,144ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_inductor_cache_dir'},0.0,,9522ms,,0.22.0.dev20250201+cu124,126ms,165.37928342819214ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,8021,9522ms,, +,,,1390578176,,,,,,,206.4340798854828s,2.7.0.dev20250201+cu124,0.0img/s,1,,mps_ao_ppb_None_save_export,217.42104578018188,,1,1,,mps,0,,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_inductor_cache_dir'},,,,,0.22.0.dev20250201+cu124,,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,1326,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast +,,,7556661248,0,218ms,322ms,236ms,0.998383426964283,104ms,138.59291863441467s,2.7.0.dev20250201+cu124,7.215375863739731img/s,1,116ms,mps_ao_ppb_None_load_export_cold,143.01005744934082,131ms,1,7,112ms,mps,,122ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_load_export_inductor_cache_dir'},0.0,,579ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast,0.22.0.dev20250201+cu124,115ms,138.59291863441467ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,7206,579ms,, +,,,7556661248,0,218ms,258ms,237ms,0.998383426964283,97ms,136.831298828125s,2.7.0.dev20250201+cu124,7.308269442476818img/s,1,116ms,mps_ao_ppb_None_load_export,141.67460775375366,129ms,1,7,111ms,mps,,120ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_load_export_inductor_cache_dir'},0.0,,589ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast,0.22.0.dev20250201+cu124,114ms,136.831298828125ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,7206,589ms,, +,,,7581827072,0,190ms,374ms,216ms,0.9984678273200989,170ms,149.05044078826904s,2.7.0.dev20250201+cu124,6.70913815961492img/s,1,187ms,mps_ao_ppb_None_load_export_gpu_preproc,153.32005190849304,142ms,1,7,181ms,mps,,143ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_load_export_inductor_cache_dir'},0.0,,596ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast,0.22.0.dev20250201+cu124,135ms,149.05044078826904ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,7230,596ms,, +,None,,7556661248,0,208ms,54466ms,226ms,0.9983833708167076,188ms,287.1738612651825s,2.7.0.dev20250201+cu124,3.482211074484173img/s,1,131ms,mps_ao_ppb_None_fast_export_cold,295.3504989147186,278ms,1,7,108ms,mps,,140ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_fast_export_inductor_cache_dir'},0.0,,62539ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast,0.22.0.dev20250201+cu124,109ms,287.1738612651825ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,7206,62539ms,, +,None,,7556661248,0,218ms,1720ms,230ms,0.9983833900690079,195ms,141.05165219306946s,2.7.0.dev20250201+cu124,7.089601464796843img/s,1,230ms,mps_ao_ppb_None_fast_export,147.43897795677185,133ms,1,7,216ms,mps,,222ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_fast_export_inductor_cache_dir'},0.0,,3561ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast,0.22.0.dev20250201+cu124,111ms,141.05165219306946ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,7206,3561ms,, +,None,,7581827072,0,185ms,1572ms,197ms,0.9984678581357003,94ms,148.53872227668762s,2.7.0.dev20250201+cu124,6.73225125861302img/s,1,107ms,mps_ao_ppb_None_fast_export_gpu_preproc,154.97156023979187,141ms,1,7,105ms,mps,,112ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_fast_export_inductor_cache_dir'},0.0,,4246ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast,0.22.0.dev20250201+cu124,127ms,148.53872227668762ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,7230,4246ms,, +None,None,,4427842560,0,74ms,63302ms,84ms,0.9964296479523181,22ms,723.8993864059448s,2.7.0.dev20250201+cu124,1.3814074424967462img/s,1,1071ms,mps_ao_ppb_None_fast_furious_cold,733.4108500480652,716ms,1,4,29ms,mps,,37ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_furious_inductor_cache_dir'},0.0,,581345ms,,0.22.0.dev20250201+cu124,49ms,723.8993864059448ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,4222,581345ms,, +None,None,,4427842560,0,74ms,1300ms,85ms,0.9964293534457683,20ms,58.8767945766449s,2.7.0.dev20250201+cu124,16.9846202937936img/s,1,350ms,mps_ao_ppb_None_fast_furious,64.73449230194092,51ms,1,4,29ms,mps,,30ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_furious_inductor_cache_dir'},0.0,,8402ms,,0.22.0.dev20250201+cu124,34ms,58.8767945766449ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,4222,8402ms,, +None,,,903450624,,,,,,,315.72570967674255s,2.7.0.dev20250201+cu124,0.0img/s,1,,mps_ao_ppb_None_save_export_furious,324.74191069602966,,1,0,,mps,0,,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_furious_inductor_cache_dir'},,,,,0.22.0.dev20250201+cu124,,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,861,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious +None,,,3998911488,0,82ms,301ms,90ms,0.9955771351754665,41ms,57.82986092567444s,2.7.0.dev20250201+cu124,17.292104528579888img/s,1,38ms,mps_ao_ppb_None_load_export_furious_cold,62.62674617767334,51ms,1,3,37ms,mps,,40ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_load_export_furious_inductor_cache_dir'},0.0,,754ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious,0.22.0.dev20250201+cu124,46ms,57.82986092567444ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,3813,754ms,, +None,,,3998911488,0,88ms,252ms,97ms,0.9955771351754665,32ms,65.55874681472778s,2.7.0.dev20250201+cu124,15.25349474458456img/s,1,80ms,mps_ao_ppb_None_load_export_furious,70.35485363006592,58ms,1,3,39ms,mps,,40ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_load_export_furious_inductor_cache_dir'},0.0,,875ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious,0.22.0.dev20250201+cu124,53ms,65.55874681472778ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,3813,875ms,, +None,,,4024077312,0,45ms,285ms,56ms,0.9959434471726417,29ms,41.67199182510376s,2.7.0.dev20250201+cu124,23.996933100701625img/s,1,35ms,mps_ao_ppb_None_load_export_furious_gpu_preproc,46.09472918510437,35ms,1,3,35ms,mps,,36ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_load_export_furious_inductor_cache_dir'},0.0,,653ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious,0.22.0.dev20250201+cu124,32ms,41.67199182510376ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,3837,653ms,, +None,None,,3998911488,0,68ms,51237ms,77ms,0.9966195167303086,20ms,211.8625111579895s,2.7.0.dev20250201+cu124,4.720042231795708img/s,1,27ms,mps_ao_ppb_None_fast_export_furious_cold,218.6763949394226,204ms,1,3,30ms,mps,,66ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_fast_export_furious_inductor_cache_dir'},0.0,,79408ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious,0.22.0.dev20250201+cu124,32ms,211.8625111579895ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,3813,79408ms,, +None,None,,3998911488,0,70ms,1746ms,78ms,0.9966195802688599,59ms,51.70280361175537s,2.7.0.dev20250201+cu124,19.341310918246524img/s,1,43ms,mps_ao_ppb_None_fast_export_furious,57.28682208061218,44ms,1,3,34ms,mps,,70ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_fast_export_furious_inductor_cache_dir'},0.0,,3842ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious,0.22.0.dev20250201+cu124,35ms,51.70280361175537ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,3813,3842ms,, +None,None,,3998911488,0,65ms,6664ms,75ms,0.9956195802688599,20ms,59.52086091041565s,2.7.0.dev20250201+cu124,16.8008322578716img/s,1,56ms,mps_ao_ppb_None_fast_export_furious_recompiles,64.74269723892212,52ms,1,3,27ms,mps,,29ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_fast_export_furious_inductor_cache_dir'},0.0,None,11728ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious,0.22.0.dev20250201+cu124,30ms,59.52086091041565ms,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,3813,11728ms,, +None,None,,4024077312,0,37ms,1743ms,46ms,0.9960403459072114,19ms,37.689289808273315s,2.7.0.dev20250201+cu124,26.5327366232432img/s,1,26ms,mps_ao_ppb_None_fast_export_furious_gpu_preproc,42.8827166557312,31ms,1,3,27ms,mps,,30ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_fast_export_furious_inductor_cache_dir'},0.0,,3914ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious,0.22.0.dev20250201+cu124,23ms,37.689289808273315ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,3837,3914ms,, +None,None,,4024077312,0,35ms,1672ms,43ms,0.9950685520768165,22ms,44.08118724822998s,2.7.0.dev20250201+cu124,22.685414400678457img/s,1,26ms,mps_ao_ppb_None_fast_export_furious_gpu_preproc_recompiles,50.419389486312866,36ms,1,3,26ms,mps,,31ms,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/mps_fast_export_furious_inductor_cache_dir'},0.0,None,9520ms,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/exported_models/mps_ao_fast_furious,0.22.0.dev20250201+cu124,23ms,44.08118724822998ms,None,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_20/amg_baseline_annotations,3837,9520ms,, diff --git a/torchao/_models/sam2/automatic_mask_generator.py b/torchao/_models/sam2/automatic_mask_generator.py index 665a211035..6f4f1d3e7b 100644 --- a/torchao/_models/sam2/automatic_mask_generator.py +++ b/torchao/_models/sam2/automatic_mask_generator.py @@ -538,11 +538,11 @@ def _process_batch_fullgraph( ] image_embed_input = image_embed[-1].unsqueeze(0).clone() low_res_masks, iou_preds = self.predictor._predict_masks( - high_res_feats_input, - image_embed_input, - image_pe, - in_points[:, None, :], - in_labels[:, None], + [t.contiguous() for t in high_res_feats_input], + image_embed_input.contiguous(), + image_pe.contiguous(), + in_points[:, None, :].contiguous(), + in_labels[:, None].contiguous(), boxes=None, mask_input=None, multimask_output=self.multimask_output, diff --git a/torchao/_models/sam2/modeling/sam/prompt_encoder.py b/torchao/_models/sam2/modeling/sam/prompt_encoder.py index 6bb58d62ba..94b7fda8b2 100644 --- a/torchao/_models/sam2/modeling/sam/prompt_encoder.py +++ b/torchao/_models/sam2/modeling/sam/prompt_encoder.py @@ -186,6 +186,12 @@ def forward( torch.Tensor: dense embeddings for the masks, in the shape Bx(embed_dim)x(embed_H)x(embed_W) """ + # if boxes is not None: + # raise ValueError("Currently do not support boxes. " + # "Please create an issue on pytorch/ao.") + # if masks is not None: + # raise ValueError("Currently do not support masks. " + # "Please create an issue on pytorch/ao.") bs = self._get_batch_size(points, boxes, masks) sparse_embeddings = torch.empty( (bs, 0, self.embed_dim), device=self._get_device() diff --git a/torchao/_models/sam2/sam2_image_predictor.py b/torchao/_models/sam2/sam2_image_predictor.py index 02d9aed547..a4aa1c668c 100644 --- a/torchao/_models/sam2/sam2_image_predictor.py +++ b/torchao/_models/sam2/sam2_image_predictor.py @@ -430,12 +430,15 @@ def _predict( for feat_level in high_res_feats ] image_embed_input = image_embed[img_idx].unsqueeze(0).clone() + assert boxes is None + assert mask_input is None + assert multimask_output is True low_res_masks, iou_predictions = self._predict_masks( - high_res_feats_input, - image_embed_input, - image_pe, - point_coords, - point_labels, + [t.contiguous() for t in high_res_feats_input], + image_embed_input.contiguous(), + image_pe.contiguous(), + point_coords.contiguous(), + point_labels.contiguous(), boxes=boxes, mask_input=mask_input, multimask_output=multimask_output, @@ -498,6 +501,10 @@ def _predict_masks( # ] high_res_features = high_res_feats_input with torch.autograd.profiler.record_function("self.model.sam_mask_decoder"): + # if not multimask_output: + # raise ValueError("Expected multimask_output.") + # if batched_mode: + # raise ValueError("Did not expected repeat_image.") low_res_masks, iou_predictions, _, _ = self.model.sam_mask_decoder( # image_embeddings=self._features["image_embed"][img_idx].unsqueeze(0).clone(), # image_embeddings=image_embed[img_idx].unsqueeze(0).clone(), diff --git a/torchao/_models/sam2/utils/transforms.py b/torchao/_models/sam2/utils/transforms.py index 95970ba108..c616233050 100644 --- a/torchao/_models/sam2/utils/transforms.py +++ b/torchao/_models/sam2/utils/transforms.py @@ -27,11 +27,10 @@ def __init__( self.mean = [0.485, 0.456, 0.406] self.std = [0.229, 0.224, 0.225] self.to_tensor = ToTensor() - self.transforms = torch.jit.script( - nn.Sequential( - Resize((self.resolution, self.resolution)), - Normalize(self.mean, self.std), - ) + # self.transforms = torch.jit.script( + self.transforms = nn.Sequential( + Resize((self.resolution, self.resolution)), + Normalize(self.mean, self.std), ) def __call__(self, x): From 4df4d031adbadbbe99451241f82fe3ed9d446a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleksandar=20Samard=C5=BEi=C4=87?= <115986737+alexsamardzic@users.noreply.github.com> Date: Wed, 5 Feb 2025 23:27:54 +0100 Subject: [PATCH 3/6] Moved CUTLASS pin to v3.7.0 (#1672) --- third_party/cutlass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/cutlass b/third_party/cutlass index bf9da7b76c..b78588d163 160000 --- a/third_party/cutlass +++ b/third_party/cutlass @@ -1 +1 @@ -Subproject commit bf9da7b76c766d7ee7d536afc77880a4ef1f1156 +Subproject commit b78588d1630aa6643bf021613717bafb705df4ef From bc1530b80a24db8c2bb9225709026560ebf90531 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 5 Feb 2025 15:55:29 -0800 Subject: [PATCH 4/6] Q dq layout (#1642) * add q-dq layout for ET * up * up * up * up * up * up * up --- .../workflows/torchao_experimental_test.yml | 3 +- torchao/experimental/q_dq_layout.py | 61 ++++++ ...est_int8_dynamic_activation_intx_weight.py | 186 ++++++++++++++++++ ...8_dynamic_activation_intx_weight_layout.py | 154 --------------- 4 files changed, 249 insertions(+), 155 deletions(-) create mode 100644 torchao/experimental/q_dq_layout.py create mode 100644 torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py delete mode 100644 torchao/experimental/tests/test_packed_linear_int8_dynamic_activation_intx_weight_layout.py diff --git a/.github/workflows/torchao_experimental_test.yml b/.github/workflows/torchao_experimental_test.yml index c1419bccc6..08f494c71d 100644 --- a/.github/workflows/torchao_experimental_test.yml +++ b/.github/workflows/torchao_experimental_test.yml @@ -35,8 +35,9 @@ jobs: conda activate venv pip install --extra-index-url "https://download.pytorch.org/whl/nightly/cpu" torch=="2.6.0.dev20250104" pip install numpy + pip install pytest USE_CPP=1 pip install . - name: Run tests run: | conda activate venv - python torchao/experimental/tests/test_packed_linear_int8_dynamic_activation_intx_weight_layout.py + pytest torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py diff --git a/torchao/experimental/q_dq_layout.py b/torchao/experimental/q_dq_layout.py new file mode 100644 index 0000000000..b9337ae027 --- /dev/null +++ b/torchao/experimental/q_dq_layout.py @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch + +from torchao.dtypes.affine_quantized_tensor import ( + AffineQuantizedTensor, + register_layout, +) +from torchao.dtypes.affine_quantized_tensor_ops import ( + register_aqt_quantized_linear_dispatch, +) + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + +import sys + +handler = logging.StreamHandler(sys.stdout) +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +from torchao.dtypes.utils import PlainLayout + + +class QDQLayout(PlainLayout): + pass + + +from torchao.dtypes.uintx.plain_layout import PlainAQTTensorImpl + + +@register_layout(QDQLayout) +class _Impl(PlainAQTTensorImpl): + pass + + +def _linear_check(input_tensor, weight_tensor, bias): + layout = weight_tensor.tensor_impl.get_layout() + return isinstance(layout, QDQLayout) + + +def _linear_impl(input_tensor, weight_tensor, bias): + if isinstance(input_tensor, AffineQuantizedTensor): + input_tensor = input_tensor.dequantize() + if isinstance(weight_tensor, AffineQuantizedTensor): + weight_tensor = weight_tensor.dequantize() + return torch.nn.functional.linear(input_tensor, weight_tensor, bias) + + +register_aqt_quantized_linear_dispatch( + _linear_check, + _linear_impl, +) diff --git a/torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py b/torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py new file mode 100644 index 0000000000..63a8892425 --- /dev/null +++ b/torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py @@ -0,0 +1,186 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import itertools +import tempfile +import unittest + +import torch +from torch.testing import FileCheck + +from torchao.dtypes import PlainLayout +from torchao.experimental.packed_linear_int8_dynamic_activation_intx_weight_layout import ( + PackedLinearInt8DynamicActivationIntxWeightLayout, +) +from torchao.experimental.q_dq_layout import QDQLayout +from torchao.experimental.quant_api import ( + int8_dynamic_activation_intx_weight, +) +from torchao.quantization.granularity import ( + PerGroup, + PerRow, +) +from torchao.quantization.quant_api import quantize_ +from torchao.utils import unwrap_tensor_subclass + + +class TestInt8DynamicActivationIntxWeight(unittest.TestCase): + def test_accuracy(self): + """ + Checks the accuracy of different layouts by comparing the results to PlainLayout() + """ + m = 1 + n = 1071 + k = 4096 + activations = torch.randn(m, k) + model = torch.nn.Sequential(*[torch.nn.Linear(k, n, bias=False)]) + + reference_layout = PlainLayout() + test_layouts = [ + PackedLinearInt8DynamicActivationIntxWeightLayout(), + QDQLayout(), + ] + test_weight_dtypes = [ + torch.int1, + torch.int2, + torch.int3, + torch.int4, + torch.int5, + torch.int6, + torch.int7, + torch.int8, + ] + test_has_weight_zeros = [True, False] + test_granularities = [PerGroup(128), PerRow()] + for layout, weight_dtype, has_weight_zeros, granularity in itertools.product( + test_layouts, test_weight_dtypes, test_has_weight_zeros, test_granularities + ): + quantized_model = copy.deepcopy(model) + quantize_( + quantized_model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + layout=layout, + ), + ) + + quantized_model_reference = copy.deepcopy(model) + quantize_( + quantized_model_reference, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + layout=reference_layout, + ), + ) + + with torch.no_grad(): + result = quantized_model(activations) + expected_result = quantized_model_reference(activations) + self.assertTrue(torch.allclose(result, expected_result, atol=1e-6)) + + def test_export_compile_aoti_PackedLinearInt8DynamicActivationIntxWeightLayout( + self, + ): + """ + Checks that models quantized with PackedLinearInt8DynamicActivationIntxWeightLayout() work with + torch.export.export, torch.compile, and AOTI. + """ + granularity = PerRow() + m = 3 + k0 = 512 + k1 = 256 + k2 = 128 + k3 = 1024 + weight_dtype = torch.int4 + has_weight_zeros = True + layers = [ + torch.nn.Linear(k0, k1, bias=False), + torch.nn.Linear(k1, k2, bias=False), + torch.nn.Linear(k2, k3, bias=False), + ] + model = torch.nn.Sequential(*layers) + activations = torch.randn(2, 1, m, k0, dtype=torch.float32) + + quantize_( + model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + layout=PackedLinearInt8DynamicActivationIntxWeightLayout(), + ), + ) + eager_results = model(activations) + + unwrapped_model = copy.deepcopy(model) + unwrap_tensor_subclass(model) + + # Export + exported = torch.export.export(model, (activations,), strict=True) + exported_results = exported.module()(activations) + self.assertTrue(torch.allclose(eager_results, exported_results)) + + # Compile + compiled = torch.compile(unwrapped_model) + with torch.no_grad(): + compiled_results = compiled(activations) + self.assertTrue(torch.allclose(eager_results, compiled_results)) + + # AOTI + with tempfile.TemporaryDirectory() as tmpdirname: + package_path = f"{tmpdirname}/model.pt2" + torch._inductor.aoti_compile_and_package( + exported, package_path=package_path + ) + fn = torch._inductor.aoti_load_package(package_path) + aoti_results = fn(activations) + self.assertTrue(torch.allclose(eager_results, aoti_results)) + + def test_export_QDQLayout(self): + """ + Checks that models quantized with TestQDQLayout() export as expected + """ + granularity = PerGroup(64) + weight_dtype = torch.int4 + has_weight_zeros = False + layers = [ + torch.nn.Linear(512, 256, bias=False), + ] + model = torch.nn.Sequential(*layers) + activations = torch.randn(1, 512, dtype=torch.float32) + + quantize_( + model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + layout=QDQLayout(), + ), + ) + eager_results = model(activations) + + unwrap_tensor_subclass(model) + exported = torch.export.export(model, (activations,), strict=True) + exported_results = exported.module()(activations) + self.assertTrue(torch.allclose(eager_results, exported_results)) + + expected_lines = [ + "torch.ops.quant.choose_qparams_affine.default(input_1, 'ASYMMETRIC', [1, 512], torch.int32, -128, 127, None, torch.float32, torch.int32)", + "torch.ops.quant.quantize_affine.default(input_1, [1, 512], getitem, getitem_1, torch.int32, -128, 127)", + "torch.ops.quant.dequantize_affine.default(quantize_affine, [1, 512], getitem, getitem_1, torch.int32, -128, 127)", + "torch.ops.quant.dequantize_affine.default(p_fn_0_parametrizations_weight_original0, [1, 64], p_fn_0_parametrizations_weight_original1, None, torch.int32, -8, 7, 'NONE')", + "torch.ops.aten.linear.default(dequantize_affine, dequantize_affine_1)", + ] + for line in expected_lines: + FileCheck().check_count(line, 1, exactly=True).run( + exported.graph_module.code + ) diff --git a/torchao/experimental/tests/test_packed_linear_int8_dynamic_activation_intx_weight_layout.py b/torchao/experimental/tests/test_packed_linear_int8_dynamic_activation_intx_weight_layout.py deleted file mode 100644 index 284ef4b2a8..0000000000 --- a/torchao/experimental/tests/test_packed_linear_int8_dynamic_activation_intx_weight_layout.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -import copy -import tempfile -import unittest - -import torch - -from torchao.dtypes import PlainLayout -from torchao.experimental.packed_linear_int8_dynamic_activation_intx_weight_layout import ( - PackedLinearInt8DynamicActivationIntxWeightLayout, -) -from torchao.experimental.quant_api import ( - int8_dynamic_activation_intx_weight, -) -from torchao.quantization.granularity import ( - PerGroup, - PerRow, -) -from torchao.quantization.quant_api import quantize_ -from torchao.utils import unwrap_tensor_subclass - - -class TestPackedLinearInt8DynamicActivationIntxWeightLayout(unittest.TestCase): - def test_accuracy(self): - """ - Checks the accuracy of PackedLinearInt8DynamicActivationIntxWeightLayout() by comparing - its results to the results of a reference model that uses PlainLayout() - """ - granularity = PerGroup(128) - m = 1 - n = 1071 - k = 4096 - activations = torch.randn(m, k) - model = torch.nn.Sequential(*[torch.nn.Linear(k, n, bias=False)]) - - for weight_dtype in [ - torch.int1, - torch.int2, - torch.int3, - torch.int4, - torch.int5, - torch.int6, - torch.int7, - torch.int8, - ]: - for has_weight_zeros in [True, False]: - print( - f"Testing weight_dtype={weight_dtype}, has_weight_zeros={has_weight_zeros}" - ) - quantized_model = copy.deepcopy(model) - quantize_( - quantized_model, - int8_dynamic_activation_intx_weight( - weight_dtype=weight_dtype, - granularity=granularity, - has_weight_zeros=has_weight_zeros, - layout=PackedLinearInt8DynamicActivationIntxWeightLayout(), # default - ), - ) - - quantized_model_reference = copy.deepcopy(model) - quantize_( - quantized_model_reference, - int8_dynamic_activation_intx_weight( - weight_dtype=weight_dtype, - granularity=granularity, - has_weight_zeros=has_weight_zeros, - layout=PlainLayout(), - ), - ) - - with torch.no_grad(): - result = quantized_model(activations) - expected_result = quantized_model_reference(activations) - - num_mismatch_at_low_tol = 0 - num_total = result.reshape(-1).shape[0] - for i in range(num_total): - actual_val = result.reshape(-1)[i] - expected_val = expected_result.reshape(-1)[i] - self.assertTrue(torch.allclose(actual_val, expected_val, atol=1e-6)) - if not torch.allclose(actual_val, expected_val): - num_mismatch_at_low_tol += 1 - - # Assert at most 5% of entries are not close at a low tolerance - self.assertTrue(num_mismatch_at_low_tol / num_total <= 0.05) - - def test_export_compile_aoti(self): - """ - Checks that models quantized with PackedLinearInt8DynamicActivationIntxWeightLayout() work with - torch.export.export, torch.compile, and AOTI. - """ - granularity = PerRow() - m = 3 - k0 = 512 - k1 = 256 - k2 = 128 - k3 = 1024 - weight_dtype = torch.int4 - has_weight_zeros = True - layers = [ - torch.nn.Linear(k0, k1, bias=False), - torch.nn.Linear(k1, k2, bias=False), - torch.nn.Linear(k2, k3, bias=False), - ] - model = torch.nn.Sequential(*layers) - activations = torch.randn(2, 1, m, k0, dtype=torch.float32) - - print("Quantizing model") - quantize_( - model, - int8_dynamic_activation_intx_weight( - weight_dtype=weight_dtype, - granularity=granularity, - has_weight_zeros=has_weight_zeros, - layout=PackedLinearInt8DynamicActivationIntxWeightLayout(), - ), - ) - eager_results = model(activations) - - unwrapped_model = copy.deepcopy(model) - unwrap_tensor_subclass(model) - - print("Exporting quantized model") - exported = torch.export.export(model, (activations,), strict=True) - exported_results = exported.module()(activations) - self.assertTrue(torch.allclose(eager_results, exported_results)) - - print("Compiling quantized model") - compiled = torch.compile(unwrapped_model) - with torch.no_grad(): - compiled_results = compiled(activations) - self.assertTrue(torch.allclose(eager_results, compiled_results)) - - with tempfile.TemporaryDirectory() as tmpdirname: - package_path = f"{tmpdirname}/model.pt2" - print("Exporting quantized model with AOTI") - torch._inductor.aoti_compile_and_package( - exported, package_path=package_path - ) - - print("Running quantized model in AOTI") - fn = torch._inductor.aoti_load_package(package_path) - aoti_results = fn(activations) - self.assertTrue(torch.allclose(eager_results, aoti_results)) - - -if __name__ == "__main__": - unittest.main() From c6611be254be9563d045f515d94c20c8c54be8ec Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Wed, 5 Feb 2025 16:01:48 -0800 Subject: [PATCH 5/6] Remove duplicate definitions of fill_defaults (#1674) --- torchao/dtypes/uintx/uint4_layout.py | 27 ++------------------------- torchao/prototype/dtypes/uint2.py | 11 ++--------- 2 files changed, 4 insertions(+), 34 deletions(-) diff --git a/torchao/dtypes/uintx/uint4_layout.py b/torchao/dtypes/uintx/uint4_layout.py index 204aefcf3c..0b6512640e 100644 --- a/torchao/dtypes/uintx/uint4_layout.py +++ b/torchao/dtypes/uintx/uint4_layout.py @@ -3,6 +3,8 @@ import torch.utils._pytree as pytree from torch.library import Library, impl +from torchao.utils import fill_defaults + def down_size(size): assert size[-1] % 2 == 0, f"{size} last dim not divisible by two" @@ -13,31 +15,6 @@ def up_size(size): return (*size[:-1], size[-1] * 2) -def fill_defaults(args, n, defaults_tail): - """ - __torch_dispatch__ doesn't guarantee the number of arguments you are - passed (e.g., defaulted arguments are not passed); but usually it is - convenient to pad out the arguments list with defaults. This function - helps you do that. - Args: - args: the list of positional arguments passed to __torch_dispatch__ - n: the number of arguments you are expecting to get - defaults_tail: default values for the arguments, starting from the - end of the list - Example: - >>> fill_defaults([1, 2, 3], 5, [3, 4, 5]) - [1, 2, 3, 4, 5] - >>> fill_defaults([1, 2, 3], 5, [None, None, None]) - [1, 2, 3, None, None]] - """ - if n - len(defaults_tail) > len(args): - raise RuntimeError("not enough defaults to fill arguments") - r = list(args) - for i in range(len(args), n): - r.append(defaults_tail[i - n + len(defaults_tail)]) - return r - - # from # https://github.com/drisspg/transformer_nuggets/blob/9ad3a7fc552a954eb702ade0e276b8d8e09c3db6/transformer_nuggets/quant/qlora.py#L233 diff --git a/torchao/prototype/dtypes/uint2.py b/torchao/prototype/dtypes/uint2.py index 9c14d8ae72..d54e541751 100644 --- a/torchao/prototype/dtypes/uint2.py +++ b/torchao/prototype/dtypes/uint2.py @@ -4,16 +4,9 @@ import torch import torch._prims_common as utils -UINT2_OPS_TABLE: Dict[Any, Any] = {} - +from torchao.utils import fill_defaults -def fill_defaults(args, n, defaults_tail): - if n - len(defaults_tail) > len(args): - raise RuntimeError("not enough defaults to fill arguments") - r = list(args) - for i in range(len(args), n): - r.append(defaults_tail[i - n + len(defaults_tail)]) - return r +UINT2_OPS_TABLE: Dict[Any, Any] = {} def implements(aten_ops): From 867a91f930d16f1a79eda3c2d505851e3817b786 Mon Sep 17 00:00:00 2001 From: HDCharles <39544797+HDCharles@users.noreply.github.com> Date: Wed, 5 Feb 2025 23:32:29 -0500 Subject: [PATCH 6/6] update notify in build_wheels_linux.yml (#1676) remove debug code --- .github/workflows/build_wheels_linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_wheels_linux.yml b/.github/workflows/build_wheels_linux.yml index 8b966059f3..fd16bf37a8 100644 --- a/.github/workflows/build_wheels_linux.yml +++ b/.github/workflows/build_wheels_linux.yml @@ -70,7 +70,7 @@ jobs: password: ${{ secrets.TORCHAO_NOTIFY_PASSWORD }} from: torchao.notify@gmail.com to: ${{ secrets.TORCHAO_NOTIFY_RECIPIENT }} - subject: breakbutterflyScheduled Build Failure for TorchAO + subject: Scheduled Build Failure for TorchAO body: | Build Failure Notification for TorchAO