Skip to content

Commit 348aaba

Browse files
Update Gaudi runner image to latest SynapseAI and enable previously disabled tests (#3653)
* update synapse and add tp tests
* only skip regional compile speedup check
* pass sdp test on hpu
1 parent 3b13453 commit 348aaba

File tree

5 files changed: +23 -17 lines changed

.github/workflows/gaudi3_scheduled.yml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
group: itac-bm-emr-gaudi3-dell-2gaudi
1616

1717
container:
18-
image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
18+
image: docker://vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
1919
options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
2020
env:
2121
OMPI_MCA_btl_vader_single_copy_mechanism: none
@@ -66,15 +66,20 @@ jobs:
6666
run: |
6767
make test_big_modeling
6868
69+
- name: Run DeepSpeed integration tests
70+
if: ${{ !cancelled() && (success() || failure()) }}
71+
run: |
72+
make test_deepspeed
73+
6974
- name: Run FSDP integration tests
7075
if: ${{ !cancelled() && (success() || failure()) }}
7176
run: |
7277
make test_fsdp
7378
74-
- name: Run DeepSpeed integration tests
79+
- name: Run TP integration tests
7580
if: ${{ !cancelled() && (success() || failure()) }}
7681
run: |
77-
make test_deepspeed
82+
make test_tp
7883
7984
- name: Run Examples tests
8085
if: ${{ !cancelled() && (success() || failure()) }}

Makefile

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,23 @@ style:
2323
doc-builder style src/accelerate docs/source --max_len 119
2424

2525
# Run tests for the library
26-
test_big_modeling:
27-
python -m pytest -s -v ./tests/test_big_modeling.py ./tests/test_modeling_utils.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_big_modeling.log",)
28-
2926
test_core:
30-
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py --ignore=./tests/deepspeed --ignore=./tests/test_big_modeling.py \
31-
--ignore=./tests/fsdp --ignore=./tests/tp --ignore=./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)
27+
python -m pytest -s -v ./tests/ \
28+
--ignore=./tests/test_big_modeling.py \
29+
--ignore=./tests/test_modeling_utils.py \
30+
--ignore=./tests/test_examples.py \
31+
--ignore=./tests/test_cli.py \
32+
--ignore=./tests/deepspeed \
33+
--ignore=./tests/fsdp \
34+
--ignore=./tests/tp \
35+
$(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)
3236

3337
test_cli:
3438
python -m pytest -s -v ./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_cli.log",)
3539

40+
test_big_modeling:
41+
python -m pytest -s -v ./tests/test_big_modeling.py ./tests/test_modeling_utils.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_big_modeling.log",)
42+
3643
test_deepspeed:
3744
python -m pytest -s -v ./tests/deepspeed $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_deepspeed.log",)
3845

src/accelerate/test_utils/scripts/test_merge_weights.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,6 @@ def mock_training(accelerator, model):
7979

8080
def check_weights(operation, state_1, state_2):
8181
for weight_1, weight_2 in zip(state_1.values(), state_2.values()):
82-
if str(weight_1.device) != torch_device:
83-
weight_1 = weight_1.to(torch_device)
84-
if str(weight_2.device) != torch_device:
85-
weight_2 = weight_2.to(torch_device)
8682
if operation == "same":
8783
assert torch.allclose(weight_1, weight_2)
8884
else:
@@ -91,15 +87,15 @@ def check_weights(operation, state_1, state_2):
9187

9288
def check_safetensors_weights(path, model):
9389
safe_state_dict = load_file(path / "model.safetensors")
94-
safe_loaded_model = TinyModel()
90+
safe_loaded_model = TinyModel().to(torch_device)
9591
check_weights("diff", model.state_dict(), safe_loaded_model.state_dict())
9692
safe_loaded_model.load_state_dict(safe_state_dict)
9793
check_weights("same", model.state_dict(), safe_loaded_model.state_dict())
9894

9995

10096
def check_pytorch_weights(path, model):
10197
nonsafe_state_dict = torch.load(path / "pytorch_model.bin", weights_only=True)
102-
nonsafe_loaded_model = TinyModel()
98+
nonsafe_loaded_model = TinyModel().to(torch_device)
10399
check_weights("diff", model.state_dict(), nonsafe_loaded_model.state_dict())
104100
nonsafe_loaded_model.load_state_dict(nonsafe_state_dict)
105101
check_weights("same", model.state_dict(), nonsafe_loaded_model.state_dict())

tests/test_compile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
backend = "inductor"
3535

3636

37-
@require_non_hpu
3837
@require_huggingface_suite
3938
class RegionalCompilationTester(unittest.TestCase):
4039
def _get_model_and_inputs(self):
@@ -109,6 +108,7 @@ def test_regional_compilation_cold_start(self):
109108
release_memory(model, full_compilation_model, regional_compilation_model)
110109

111110
@slow
111+
@require_non_hpu
112112
@require_non_cpu
113113
@require_huggingface_suite
114114
def test_regional_compilation_inference_speedup(self):

tests/test_multigpu.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
path_in_accelerate_package,
2929
require_huggingface_suite,
3030
require_multi_device,
31-
require_non_hpu,
3231
require_non_torch_xla,
3332
require_pippy,
3433
require_torchvision,
@@ -70,7 +69,6 @@ def test_pad_across_processes(self):
7069
execute_subprocess_async(cmd)
7170

7271
@run_first
73-
@require_non_hpu # Synapse detected a device critical error that requires a restart
7472
@require_multi_device
7573
def test_multi_device_merge_fsdp_weights(self):
7674
print(f"Found {device_count} {torch_device} devices.")

0 commit comments

Comments
 (0)