1
+ import pkg_resources
2
+
1
3
from absl .testing import absltest
2
4
from absl .testing import parameterized
3
5
@@ -9,6 +11,7 @@ class HorovodTestCase(DataprocTestCase):
9
11
INIT_ACTIONS = ["horovod/horovod.sh" ]
10
12
GPU_INIT_ACTIONS = ["gpu/install_gpu_driver.sh" ] + INIT_ACTIONS
11
13
GPU_P100 = "type=nvidia-tesla-p100"
14
+ GPU_T4 = "type=nvidia-tesla-t4"
12
15
13
16
TENSORFLOW_TEST_SCRIPT = "scripts/verify_tensorflow.py"
14
17
PYTORCH_TEST_SCRIPT = "scripts/verify_pytorch.py"
@@ -26,6 +29,8 @@ def _submit_spark_job(self, script):
26
29
def test_horovod_cpu (self , configuration , controller ):
27
30
if self .getImageOs () == 'rocky' :
28
31
self .skipTest ("Not supported in Rocky Linux-based images" )
32
+ if self .getImageVersion () > pkg_resources .parse_version ("2.0" ):
33
+ self .skipTest ("Not supported in Dataproc image version 2.1 and 2.2" )
29
34
30
35
metadata = ""
31
36
if controller == "mpi" :
@@ -44,16 +49,18 @@ def test_horovod_cpu(self, configuration, controller):
44
49
def test_horovod_gpu (self , configuration , controller ):
45
50
if self .getImageOs () == 'rocky' :
46
51
self .skipTest ("Not supported in Rocky Linux-based images" )
52
+ if self .getImageVersion () > pkg_resources .parse_version ("2.0" ):
53
+ self .skipTest ("Not supported in Dataproc image version 2.1 and 2.2" )
47
54
48
- metadata = "cuda-version=11.1 ,cudnn-version=8.0.5.39 ,gpu-driver-provider=NVIDIA"
55
+ metadata = "cuda-version=12.4 ,cudnn-version=9.1.0.70 ,gpu-driver-provider=NVIDIA"
49
56
50
57
self .createCluster (
51
58
configuration ,
52
59
self .GPU_INIT_ACTIONS ,
53
60
timeout_in_minutes = 60 ,
54
61
machine_type = "n1-standard-8" ,
55
- master_accelerator = self .GPU_P100 ,
56
- worker_accelerator = self .GPU_P100 ,
62
+ master_accelerator = self .GPU_T4 ,
63
+ worker_accelerator = self .GPU_T4 ,
57
64
metadata = metadata )
58
65
59
66
0 commit comments