Update Gaudi runner image to latest SynapseAI and enable previously disabled tests (#3653)

IlyasMoutawwakil · web-flow · commit 348aabaaaf1b · 2025-07-16T14:33:36.000+02:00
* Update the Gaudi runner image from SynapseAI 1.20.0 to 1.21.1 and add a TP integration test step (`make test_tp`)

* On HPU, skip only the regional compilation inference speedup check instead of the whole `RegionalCompilationTester` class

* Make the FSDP merge-weights test pass on HPU (compare weights on `torch_device`; drop the `require_non_hpu` skip)
diff --git a/.github/workflows/gaudi3_scheduled.yml b/.github/workflows/gaudi3_scheduled.yml
@@ -15,7 +15,7 @@ jobs:
       group: itac-bm-emr-gaudi3-dell-2gaudi
 
     container:
-      image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      image: docker://vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
       options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
       env:
         OMPI_MCA_btl_vader_single_copy_mechanism: none
@@ -66,15 +66,20 @@ jobs:
         run: |
           make test_big_modeling
 
+      - name: Run DeepSpeed integration tests
+        if: ${{ !cancelled() && (success() || failure()) }}
+        run: |
+          make test_deepspeed
+
       - name: Run FSDP integration tests
         if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_fsdp
 
-      - name: Run DeepSpeed integration tests
+      - name: Run TP integration tests
         if: ${{ !cancelled() && (success() || failure()) }}
         run: |
-          make test_deepspeed
+          make test_tp
 
       - name: Run Examples tests
         if: ${{ !cancelled() && (success() || failure()) }}
diff --git a/Makefile b/Makefile
@@ -23,16 +23,23 @@ style:
 	doc-builder style src/accelerate docs/source --max_len 119
 	
 # Run tests for the library
-test_big_modeling:
-	python -m pytest -s -v ./tests/test_big_modeling.py ./tests/test_modeling_utils.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_big_modeling.log",)
-
 test_core:
-	python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py --ignore=./tests/deepspeed --ignore=./tests/test_big_modeling.py \
-	--ignore=./tests/fsdp --ignore=./tests/tp --ignore=./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)
+	python -m pytest -s -v ./tests/ \
+	--ignore=./tests/test_big_modeling.py \
+	--ignore=./tests/test_modeling_utils.py \
+	--ignore=./tests/test_examples.py \
+	--ignore=./tests/test_cli.py \
+	--ignore=./tests/deepspeed \
+	--ignore=./tests/fsdp \
+	--ignore=./tests/tp \
+	$(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)
 
 test_cli:
 	python -m pytest -s -v ./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_cli.log",)
 
+test_big_modeling:
+	python -m pytest -s -v ./tests/test_big_modeling.py ./tests/test_modeling_utils.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_big_modeling.log",)
+
 test_deepspeed:
 	python -m pytest -s -v ./tests/deepspeed $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_deepspeed.log",)
 
diff --git a/src/accelerate/test_utils/scripts/test_merge_weights.py b/src/accelerate/test_utils/scripts/test_merge_weights.py
@@ -79,10 +79,6 @@ def mock_training(accelerator, model):
 
 def check_weights(operation, state_1, state_2):
     for weight_1, weight_2 in zip(state_1.values(), state_2.values()):
-        if str(weight_1.device) != torch_device:
-            weight_1 = weight_1.to(torch_device)
-        if str(weight_2.device) != torch_device:
-            weight_2 = weight_2.to(torch_device)
         if operation == "same":
             assert torch.allclose(weight_1, weight_2)
         else:
@@ -91,15 +87,15 @@ def check_weights(operation, state_1, state_2):
 
 def check_safetensors_weights(path, model):
     safe_state_dict = load_file(path / "model.safetensors")
-    safe_loaded_model = TinyModel()
+    safe_loaded_model = TinyModel().to(torch_device)
     check_weights("diff", model.state_dict(), safe_loaded_model.state_dict())
     safe_loaded_model.load_state_dict(safe_state_dict)
     check_weights("same", model.state_dict(), safe_loaded_model.state_dict())
 
 
 def check_pytorch_weights(path, model):
     nonsafe_state_dict = torch.load(path / "pytorch_model.bin", weights_only=True)
-    nonsafe_loaded_model = TinyModel()
+    nonsafe_loaded_model = TinyModel().to(torch_device)
     check_weights("diff", model.state_dict(), nonsafe_loaded_model.state_dict())
     nonsafe_loaded_model.load_state_dict(nonsafe_state_dict)
     check_weights("same", model.state_dict(), nonsafe_loaded_model.state_dict())
diff --git a/tests/test_compile.py b/tests/test_compile.py
@@ -34,7 +34,6 @@
     backend = "inductor"
 
 
-@require_non_hpu
 @require_huggingface_suite
 class RegionalCompilationTester(unittest.TestCase):
     def _get_model_and_inputs(self):
@@ -109,6 +108,7 @@ def test_regional_compilation_cold_start(self):
         release_memory(model, full_compilation_model, regional_compilation_model)
 
     @slow
+    @require_non_hpu
     @require_non_cpu
     @require_huggingface_suite
     def test_regional_compilation_inference_speedup(self):
diff --git a/tests/test_multigpu.py b/tests/test_multigpu.py
@@ -28,7 +28,6 @@
     path_in_accelerate_package,
     require_huggingface_suite,
     require_multi_device,
-    require_non_hpu,
     require_non_torch_xla,
     require_pippy,
     require_torchvision,
@@ -70,7 +69,6 @@ def test_pad_across_processes(self):
             execute_subprocess_async(cmd)
 
     @run_first
-    @require_non_hpu  # Synapse detected a device critical error that requires a restart
     @require_multi_device
     def test_multi_device_merge_fsdp_weights(self):
         print(f"Found {device_count} {torch_device} devices.")