
Commit d120b9a

teng-li authored and facebook-github-bot committed
Make c10d pickling/unpickling work (pytorch#12694)
Summary: This fixes the issue for pytorch#12168

Pull Request resolved: pytorch#12694
Differential Revision: D10468717
Pulled By: teng-li
fbshipit-source-id: 3df31d75eea19d6085af665f5350d3cb667a5048
1 parent 8cb0848 commit d120b9a
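
With this change, a DistributedDataParallel module constructed against the default process group can be round-tripped through torch.save/torch.load, which is what the new test below exercises. A minimal sketch of that usage, assuming a single-rank gloo setup; the init file, checkpoint path, and placeholder model are hypothetical:

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# Hypothetical setup: one process, default group initialized via a shared file.
dist.init_process_group("gloo", init_method="file:///tmp/ddp_init",
                        world_size=1, rank=0)

model = torch.nn.Linear(2, 4).cuda()                        # placeholder module
ddp_model = DistributedDataParallel(model, device_ids=[0])  # default process group

# __getstate__ now drops the unpicklable members (process group, CUDA streams,
# allreduce options), so saving the wrapper itself works.
torch.save(ddp_model, "/tmp/ddp_checkpoint.pt")

# __setstate__ reattaches the default process group and re-registers grad hooks.
ddp_model = torch.load("/tmp/ddp_checkpoint.pt")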

File tree

2 files changed: +59 -30 lines changed


test/test_distributed.py

+28 -19
@@ -28,10 +28,29 @@
 DEFAULT_TIMEOUT = 300
 CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500}
 
+
 if INIT_METHOD.startswith("file://"):
     FOLDER = INIT_METHOD[7:]
 
 
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.fc1 = nn.Linear(2, 10, bias=False)
+        self.fc2 = nn.Linear(10, 50, bias=False)
+        self.fc3 = nn.Linear(50, 4, bias=False)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.relu(self.fc1(x))
+        x = self.relu(self.fc2(x))
+        x = self.fc3(x)
+        return F.softmax(x, dim=1)
+
+
+DDP_NET = Net()
+
+
 def get_timeout(test_id):
     test_name = test_id.split(".")[-1]
     if test_name in CUSTOMIZED_TIMEOUT:
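
The test model is hoisted from the old _create_Net helper to module level because pickle serializes an instance by reference to an importable class; a class defined inside a function or method cannot be located when loading, which would otherwise break the new save/load round trip. A small standalone illustration of that constraint (names here are only for demonstration):

import pickle
import torch.nn as nn


class ModuleLevelNet(nn.Module):
    # Importable by qualified name, so instances can be pickled.
    def __init__(self):
        super(ModuleLevelNet, self).__init__()
        self.fc = nn.Linear(2, 2)


def make_local_net():
    # Defined inside a function: pickle cannot look this class up by name.
    class LocalNet(nn.Module):
        def __init__(self):
            super(LocalNet, self).__init__()
            self.fc = nn.Linear(2, 2)
    return LocalNet()


pickle.dumps(ModuleLevelNet())         # works
try:
    pickle.dumps(make_local_net())     # fails: class is not importable
except (pickle.PicklingError, AttributeError) as err:
    print("cannot pickle locally defined class:", err)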
@@ -44,6 +63,7 @@ def get_timeout(test_id):
     print("Distributed not available, skipping tests")
     sys.exit(0)
 
+
 SKIP_IF_NO_CUDA_EXIT_CODE = 75
 SKIP_IF_NO_GPU_EXIT_CODE = 76
 SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77
@@ -1109,23 +1129,6 @@ def test_all_gather_multigpu(self):
         rank_to_GPU = self._init_multigpu_helper()
         self._test_all_gather_multigpu_helper(group, group_id, rank, rank_to_GPU)
 
-    def _create_Net(self):
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.fc1 = nn.Linear(2, 10, bias=False)
-                self.fc2 = nn.Linear(10, 50, bias=False)
-                self.fc3 = nn.Linear(50, 4, bias=False)
-                self.relu = nn.ReLU()
-
-            def forward(self, x):
-                x = self.relu(self.fc1(x))
-                x = self.relu(self.fc2(x))
-                x = self.fc3(x)
-                return F.softmax(x, dim=1)
-
-        return Net()
-
     def _model_step(self, model):
         for param in model.parameters():
             param.data += param.grad
@@ -1193,7 +1196,7 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None):
         # as baseline
 
         # cpu training setup
-        model = self._create_Net()
+        model = DDP_NET
 
         # single gpu training setup
         model_gpu = copy.deepcopy(model)
@@ -1206,6 +1209,12 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None):
             model_DDP, device_ids=gpu_subset
         )
 
+        # test serializable/unserializable
+        if INIT_METHOD.startswith("file://"):
+            _, filename = tempfile.mkstemp(prefix=FOLDER)
+            torch.save(model_DDP, filename)
+            model_DDP = torch.load(filename)
+
         # dummy data initialization
         local_bs = len(gpu_subset)
         global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs)
@@ -1232,7 +1241,7 @@ def test_DistributedDataParallelCPU(self):
         group, group_id, rank = self._init_global_test()
 
         # cpu training setup
-        model_base = self._create_Net()
+        model_base = DDP_NET
 
         # DDP-CPU training setup
         model_DDP = copy.deepcopy(model_base)

torch/nn/parallel/distributed.py

+31 -11
@@ -138,8 +138,6 @@ def __init__(self, module, device_ids=None,
         self.output_device = _get_device_index(output_device, True)
         self.broadcast_buffers = broadcast_buffers
 
-        self.allreduce_opts = dist.AllreduceOptions()
-
         MB = 1024 * 1024
 
         # used for intra-node param sync and inter-node sync as well
@@ -207,26 +205,39 @@ def __init__(self, module, device_ids=None,
         self.next_bucket = len(self.bucket_sizes) - 1
         self.ready_buckets_not_reduced = set()
         self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
-
         self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]
-
-        # default stream tracking to launch nccl reduce kernels
-        self.default_streams = []
-        for dev_id in self.device_ids:
-            with torch.cuda.device(dev_id):
-                self.default_streams.append(torch.cuda.current_stream())
-
         self._register_grad_hooks()
 
     def __getstate__(self):
+        self._check_default_group()
         attrs = copy.copy(self.__dict__)
-        del attrs['_grad_accs']
+        del attrs['process_group'], \
+            attrs['allreduce_opts'], \
+            attrs['default_streams'], \
+            attrs['_grad_accs']
         return attrs
 
     def __setstate__(self, state):
+        # If serializable, then the process group should be the default one
+        self.process_group = dist.get_default_group()
         super(DistributedDataParallel, self).__setstate__(state)
         self._register_grad_hooks()
 
+    def _check_default_group(self):
+        pickle_not_supported = False
+        try:
+            if self.process_group != dist.get_default_group():
+                pickle_not_supported = True
+        except RuntimeError:
+            pickle_not_supported = True
+
+        if pickle_not_supported:
+            raise RuntimeError("DDP Pickling/Unpickling are only supported "
+                               "when using DDP with the default process "
+                               "group. That is, when you have called "
+                               "init_process_group and have not passed "
+                               "process_group argument to DDP constructor")
+
     def forward(self, *inputs, **kwargs):
         inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
         self._sync_params()
@@ -279,6 +290,15 @@ def _sync_params(self):
 
     def _register_grad_hooks(self):
         self._grad_accs = []  # need to keep them in scope
+
+        # default stream tracking to launch nccl reduce kernels
+        self.default_streams = []
+        for dev_id in self.device_ids:
+            with torch.cuda.device(dev_id):
+                self.default_streams.append(torch.cuda.current_stream())
+
+        self.allreduce_opts = dist.AllreduceOptions()
+
         for device_idx, module in enumerate(self._module_copies):
             for p in module.parameters():
                 if p.requires_grad:
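
The _check_default_group guard added above means the round trip is only supported when DDP was constructed without an explicit process_group; wrapping a model with a group created via dist.new_group is expected to fail at pickling time. A hedged sketch of that failure mode (backend, ranks, and device choices are illustrative; assumes the default group is already initialized across two ranks):

import pickle
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# Illustrative non-default group built on top of the default one.
subgroup = dist.new_group(ranks=[0, 1])

ddp_model = DistributedDataParallel(torch.nn.Linear(2, 2).cuda(),
                                    device_ids=[0],
                                    process_group=subgroup)

try:
    pickle.dumps(ddp_model)        # __getstate__ calls _check_default_group
except RuntimeError as err:
    # "DDP Pickling/Unpickling are only supported when using DDP with the
    # default process group ..."
    print(err)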
