skip fsdp2 test for ROCm

petrex · petrex · commit bb36700376bb · 2025-01-22T23:01:34.000-08:00
diff --git a/test/float8/test_fsdp2/test_fsdp2.py b/test/float8/test_fsdp2/test_fsdp2.py
@@ -43,6 +43,9 @@
 if not is_sm_at_least_89():
     pytest.skip("Unsupported CUDA device capability version", allow_module_level=True)
 
+if torch.version.hip is not None:
+    pytest.skip("ROCm enablement in progress", allow_module_level=True)
+
 
 class TestFloat8Common:
     def broadcast_module(self, module: nn.Module) -> None:
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
@@ -903,6 +903,7 @@ def test_aq_float8_dynamic_quant_tensorwise_scaling_subclass(self, device, dtype
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_3, "int4 requires torch nightly.")
     # @unittest.skipIf(TORCH_VERSION_AT_LEAST_2_5, "int4 skipping 2.5+ for now")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_quant_subclass(self, device, dtype):
         if device == "cpu":
             self.skipTest(f"Temporarily skipping for {device}")
@@ -922,6 +923,7 @@ def test_int4_weight_only_quant_subclass(self, device, dtype):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_3, "int4 requires torch nightly.")
     # @unittest.skipIf(TORCH_VERSION_AT_LEAST_2_5, "int4 skipping 2.5+ for now")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_quant_subclass_grouped(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1075,6 +1077,7 @@ def test_gemlite_layout(self, device, dtype):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_3, "int4 requires torch nightly.")
     # @unittest.skipIf(TORCH_VERSION_AT_LEAST_2_5, "int4 skipping 2.5+ for now")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
         if device == "cpu":
             self.skipTest(f"Temporarily skipping for {device}")
diff --git a/test/kernel/test_fused_kernels.py b/test/kernel/test_fused_kernels.py
@@ -11,6 +11,8 @@
 import torch
 from galore_test_utils import get_kernel, make_copy, make_data
 
+from torchao.utils import skip_if_rocm
+
 torch.manual_seed(0)
 MAX_DIFF_no_tf32 = 1e-5
 MAX_DIFF_tf32 = 1e-3
@@ -104,6 +106,7 @@ def run_test(kernel, exp_avg, exp_avg2, grad, proj_matrix, params, allow_tf32):
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU")
 @pytest.mark.parametrize("kernel, dtype, M, N, rank, allow_tf32", TEST_CONFIGS)
+@skip_if_rocm("ROCm enablement in progress")
 def test_galore_fused_kernels(kernel, dtype, M, N, rank, allow_tf32):
     torch.backends.cuda.matmul.allow_tf32 = allow_tf32
 
diff --git a/test/prototype/test_low_bit_optim.py b/test/prototype/test_low_bit_optim.py
@@ -386,6 +386,7 @@ def world_size(self) -> int:
         not TORCH_VERSION_AT_LEAST_2_5, reason="PyTorch>=2.5 is required."
     )
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_fsdp2(self):
         optim_classes = [low_bit_optim.AdamW8bit, low_bit_optim.AdamW4bit]
         if torch.cuda.get_device_capability() >= (8, 9):

Original file line number	Diff line number	Diff line change
`@@ -386,6 +386,7 @@ def world_size(self) -> int:`
`386`	`386`	`not TORCH_VERSION_AT_LEAST_2_5, reason="PyTorch>=2.5 is required."`
`387`	`387`	`)`
`388`	`388`	`@skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)`
	`389`	`+ @skip_if_rocm("ROCm enablement in progress")`
`389`	`390`	`def test_fsdp2(self):`
`390`	`391`	`optim_classes = [low_bit_optim.AdamW8bit, low_bit_optim.AdamW4bit]`
`391`	`392`	`if torch.cuda.get_device_capability() >= (8, 9):`