Skip to content

Commit da3d8c1

Browse files
authored
Horovod test fix (#1264)
* Fixing Horovod test with correct framework versions and skipping the tests on Dataproc image versions 2.1 and 2.2
* Metadata correction
* Accelerator correction
1 parent c063e5f commit da3d8c1

File tree

2 files changed

+12
-5
lines changed

2 files changed

+12
-5
lines changed

horovod/horovod.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,10 @@ set -euxo pipefail
2222

2323
readonly DEFAULT_HOROVOD_VERSION="0.21.2"
2424
readonly DEFAULT_TENSORFLOW_VERSION="2.4.1"
25-
readonly DEFAULT_PYTORCH_VERSION="1.11.0"
25+
readonly DEFAULT_PYTORCH_VERSION="1.7.1"
2626
readonly DEFAULT_TORCHVISION_VERSION="0.8.2"
2727
readonly DEFAULT_MXNET_VERSION="1.7.0.post1"
28-
readonly DEFAULT_CUDA_VERSION="11.0"
28+
readonly DEFAULT_CUDA_VERSION="12.4"
2929

3030
HOROVOD_VERSION="$(/usr/share/google/get_metadata_value attributes/horovod-version || echo ${DEFAULT_HOROVOD_VERSION})"
3131
readonly HOROVOD_VERSION

horovod/test_horovod.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
import pkg_resources
2+
13
from absl.testing import absltest
24
from absl.testing import parameterized
35

@@ -9,6 +11,7 @@ class HorovodTestCase(DataprocTestCase):
911
INIT_ACTIONS = ["horovod/horovod.sh"]
1012
GPU_INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] + INIT_ACTIONS
1113
GPU_P100 = "type=nvidia-tesla-p100"
14+
GPU_T4 = "type=nvidia-tesla-t4"
1215

1316
TENSORFLOW_TEST_SCRIPT = "scripts/verify_tensorflow.py"
1417
PYTORCH_TEST_SCRIPT = "scripts/verify_pytorch.py"
@@ -26,6 +29,8 @@ def _submit_spark_job(self, script):
2629
def test_horovod_cpu(self, configuration, controller):
2730
if self.getImageOs() == 'rocky':
2831
self.skipTest("Not supported in Rocky Linux-based images")
32+
if self.getImageVersion() > pkg_resources.parse_version("2.0"):
33+
self.skipTest("Not supported in Dataproc image version 2.1 and 2.2")
2934

3035
metadata = ""
3136
if controller == "mpi":
@@ -44,16 +49,18 @@ def test_horovod_cpu(self, configuration, controller):
4449
def test_horovod_gpu(self, configuration, controller):
4550
if self.getImageOs() == 'rocky':
4651
self.skipTest("Not supported in Rocky Linux-based images")
52+
if self.getImageVersion() > pkg_resources.parse_version("2.0"):
53+
self.skipTest("Not supported in Dataproc image version 2.1 and 2.2")
4754

48-
metadata = "cuda-version=11.1,cudnn-version=8.0.5.39,gpu-driver-provider=NVIDIA"
55+
metadata = "cuda-version=12.4,cudnn-version=9.1.0.70,gpu-driver-provider=NVIDIA"
4956

5057
self.createCluster(
5158
configuration,
5259
self.GPU_INIT_ACTIONS,
5360
timeout_in_minutes=60,
5461
machine_type="n1-standard-8",
55-
master_accelerator=self.GPU_P100,
56-
worker_accelerator=self.GPU_P100,
62+
master_accelerator=self.GPU_T4,
63+
worker_accelerator=self.GPU_T4,
5764
metadata=metadata)
5865

5966

0 commit comments

Comments
 (0)