 DEFAULT_TIMEOUT = 300
 CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500}

+
 if INIT_METHOD.startswith("file://"):
     FOLDER = INIT_METHOD[7:]


+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.fc1 = nn.Linear(2, 10, bias=False)
+        self.fc2 = nn.Linear(10, 50, bias=False)
+        self.fc3 = nn.Linear(50, 4, bias=False)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.relu(self.fc1(x))
+        x = self.relu(self.fc2(x))
+        x = self.fc3(x)
+        return F.softmax(x, dim=1)
+
+
+DDP_NET = Net()
+
+
 def get_timeout(test_id):
     test_name = test_id.split(".")[-1]
     if test_name in CUSTOMIZED_TIMEOUT:
@@ -44,6 +63,7 @@ def get_timeout(test_id):
     print("Distributed not available, skipping tests")
     sys.exit(0)

+
 SKIP_IF_NO_CUDA_EXIT_CODE = 75
 SKIP_IF_NO_GPU_EXIT_CODE = 76
 SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77
@@ -1109,23 +1129,6 @@ def test_all_gather_multigpu(self):
         rank_to_GPU = self._init_multigpu_helper()
         self._test_all_gather_multigpu_helper(group, group_id, rank, rank_to_GPU)

-    def _create_Net(self):
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.fc1 = nn.Linear(2, 10, bias=False)
-                self.fc2 = nn.Linear(10, 50, bias=False)
-                self.fc3 = nn.Linear(50, 4, bias=False)
-                self.relu = nn.ReLU()
-
-            def forward(self, x):
-                x = self.relu(self.fc1(x))
-                x = self.relu(self.fc2(x))
-                x = self.fc3(x)
-                return F.softmax(x, dim=1)
-
-        return Net()
-
     def _model_step(self, model):
         for param in model.parameters():
             param.data += param.grad
@@ -1193,7 +1196,7 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None):
         # as baseline

         # cpu training setup
-        model = self._create_Net()
+        model = DDP_NET

         # single gpu training setup
         model_gpu = copy.deepcopy(model)
@@ -1206,6 +1209,12 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None):
             model_DDP, device_ids=gpu_subset
         )

+        # test serializable/deserializable
+        if INIT_METHOD.startswith("file://"):
+            _, filename = tempfile.mkstemp(prefix=FOLDER)
+            torch.save(model_DDP, filename)
+            model_DDP = torch.load(filename)
+
         # dummy data initialization
         local_bs = len(gpu_subset)
         global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs)
@@ -1232,7 +1241,7 @@ def test_DistributedDataParallelCPU(self):
         group, group_id, rank = self._init_global_test()

         # cpu training setup
-        model_base = DDP_NET
+        model_base = DDP_NET

         # DDP-CPU training setup
         model_DDP = copy.deepcopy(model_base)
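For reference, below is a minimal standalone sketch (not part of the patch) of the save/load round-trip that the new "test serializable/deserializable" block performs on model_DDP. A plain nn.Linear stands in for the DDP-wrapped Net, since DistributedDataParallel requires an initialized process group, and the temp-file prefix used by the test (FOLDER) is omitted here.

import tempfile

import torch
import torch.nn as nn

# Stand-in module; the test saves the DistributedDataParallel-wrapped DDP_NET instead.
model = nn.Linear(2, 4, bias=False)

# Same round-trip as the added hunk: pickle the whole module to disk, then reload it
# and continue using the reloaded copy.
_, filename = tempfile.mkstemp()      # the test passes prefix=FOLDER here
torch.save(model, filename)
model = torch.load(filename)          # newer PyTorch releases may need weights_only=False
print(model)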