
Commit 667466c

[Feature] pass policy-factory in mp data collectors
ghstack-source-id: 369e690
Pull Request resolved: #2859
1 parent 774dbeb commit 667466c

File tree

5 files changed (+231, -5 lines)

+122 (new file)
@@ -0,0 +1,122 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Updating MPS weights in multiprocess/distributed data collectors
================================================================

Overview of the Script
----------------------

This script demonstrates a weight update in TorchRL.
The script uses a custom `MPSRemoteWeightUpdater` class to update the weights of a policy network across multiple workers.

Key Features
------------

- Multi-Worker Setup: The script creates two worker processes that collect data from a Gym environment
  ("Pendulum-v1") using a policy network.
- MPS (Metal Performance Shaders) Device: The policy network is placed on an MPS device.
- Custom Weight Updater: The `MPSRemoteWeightUpdater` class is used to update the policy weights across workers. This
  class is necessary because MPS tensors cannot be sent over a pipe due to serialization/pickling issues in PyTorch.

Workaround for MPS Tensor Serialization Issue
---------------------------------------------

In PyTorch, MPS tensors cannot be serialized or pickled, which means they cannot be sent over a pipe or shared between
processes. To work around this issue, the MPSRemoteWeightUpdater class sends the policy weights on the CPU device
instead of the MPS device. The local workers then copy the weights from the CPU device to the MPS device.

Script Flow
-----------

1. Initialize the environment, policy network, and collector.
2. Update the policy weights using the MPSRemoteWeightUpdater.
3. Collect data from the environment using the policy network.
4. Zero out the policy weights after a few iterations.
5. Verify that the updated policy weights are being used by checking the actions generated by the policy network.

"""

import tensordict
import torch
from tensordict import TensorDictBase
from tensordict.nn import TensorDictModule
from torch import nn
from torchrl.collectors import MultiSyncDataCollector, RemoteWeightUpdaterBase

from torchrl.envs.libs.gym import GymEnv


class MPSRemoteWeightUpdater(RemoteWeightUpdaterBase):
    def __init__(self, policy_weights, num_workers):
        # Weights are on mps device, which cannot be shared
        self.policy_weights = policy_weights.data
        self.num_workers = num_workers

    def _sync_weights_with_worker(
        self, worker_id: int | torch.device, server_weights: TensorDictBase
    ) -> TensorDictBase:
        # Send weights on cpu - the local workers will do the cpu->mps copy
        self.collector.pipes[worker_id].send((server_weights, "update"))
        val, msg = self.collector.pipes[worker_id].recv()
        assert msg == "updated"
        return server_weights

    def _get_server_weights(self) -> TensorDictBase:
        print((self.policy_weights == 0).all())
        return self.policy_weights.cpu()

    def _maybe_map_weights(self, server_weights: TensorDictBase) -> TensorDictBase:
        print((server_weights == 0).all())
        return server_weights

    def all_worker_ids(self) -> list[int] | list[torch.device]:
        return list(range(self.num_workers))


if __name__ == "__main__":
    device = "mps"

    def env_maker():
        return GymEnv("Pendulum-v1", device="cpu")

    def policy_factory(device=device):
        return TensorDictModule(
            nn.Linear(3, 1), in_keys=["observation"], out_keys=["action"]
        ).to(device=device)

    policy = policy_factory()
    policy_weights = tensordict.from_module(policy)

    collector = MultiSyncDataCollector(
        create_env_fn=[env_maker, env_maker],
        policy_factory=policy_factory,
        total_frames=2000,
        max_frames_per_traj=50,
        frames_per_batch=200,
        init_random_frames=-1,
        reset_at_each_iter=False,
        device=device,
        storing_device="cpu",
        remote_weights_updater=MPSRemoteWeightUpdater(policy_weights, 2),
        # use_buffers=False,
        # cat_results="stack",
    )

    collector.update_policy_weights_()
    try:
        for i, data in enumerate(collector):
            if i == 2:
                print(data)
                assert (data["action"] != 0).any()
                # zero the policy
                policy_weights.data.zero_()
                collector.update_policy_weights_()
            elif i == 3:
                assert (data["action"] == 0).all(), data["action"]
                break
    finally:
        collector.shutdown()
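The docstring's workaround section above says the local workers copy the CPU-staged weights onto the MPS device. As a rough, hedged illustration of that copy (not part of this commit; the worker_policy and cpu_weights names are made up, and it assumes an MPS-enabled PyTorch build with a tensordict-compatible policy):

import tensordict
from tensordict.nn import TensorDictModule
from torch import nn

# Worker-side view (illustrative): the worker owns its own copy of the policy on mps ...
worker_policy = TensorDictModule(
    nn.Linear(3, 1), in_keys=["observation"], out_keys=["action"]
).to("mps")
worker_weights = tensordict.from_module(worker_policy)

# ... and receives weights that were staged on cpu by the updater (here just a clone).
cpu_weights = worker_weights.data.clone().cpu()

# An in-place copy moves the values onto the mps parameters, so no mps tensor
# ever has to be pickled or sent through a pipe.
worker_weights.data.copy_(cpu_weights)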

test/test_collector.py

+69 -1
@@ -39,7 +39,11 @@
     prod,
     seed_generator,
 )
-from torchrl.collectors import aSyncDataCollector, SyncDataCollector
+from torchrl.collectors import (
+    aSyncDataCollector,
+    RemoteWeightUpdaterBase,
+    SyncDataCollector,
+)
 from torchrl.collectors.collectors import (
     _Interruptor,
     MultiaSyncDataCollector,
@@ -146,6 +150,7 @@
 PYTHON_3_10 = sys.version_info.major == 3 and sys.version_info.minor == 10
 PYTHON_3_7 = sys.version_info.major == 3 and sys.version_info.minor == 7
 TORCH_VERSION = version.parse(version.parse(torch.__version__).base_version)
+_has_cuda = torch.cuda.is_available()


 class WrappablePolicy(nn.Module):
@@ -3476,6 +3481,69 @@ def __deepcopy_error__(*args, **kwargs):
         raise RuntimeError("deepcopy not allowed")


+class TestPolicyFactory:
+    class MPSRemoteWeightUpdater(RemoteWeightUpdaterBase):
+        def __init__(self, policy_weights, num_workers):
+            # Weights are on mps device, which cannot be shared
+            self.policy_weights = policy_weights.data
+            self.num_workers = num_workers
+
+        def _sync_weights_with_worker(
+            self, worker_id: int | torch.device, server_weights: TensorDictBase
+        ) -> TensorDictBase:
+            # Send weights on cpu - the local workers will do the cpu->mps copy
+            self.collector.pipes[worker_id].send((server_weights, "update"))
+            val, msg = self.collector.pipes[worker_id].recv()
+            assert msg == "updated"
+            return server_weights
+
+        def _get_server_weights(self) -> TensorDictBase:
+            return self.policy_weights.cpu()
+
+        def _maybe_map_weights(self, server_weights: TensorDictBase) -> TensorDictBase:
+            return server_weights
+
+        def all_worker_ids(self) -> list[int] | list[torch.device]:
+            return list(range(self.num_workers))
+
+    @pytest.mark.skipif(not _has_cuda, reason="requires cuda or another device than CPU.")
+    def test_weight_update(self):
+        device = "cuda:0"
+        env_maker = lambda: GymEnv("Pendulum-v1", device="cpu")
+        policy_factory = lambda: TensorDictModule(
+            nn.Linear(3, 1), in_keys=["observation"], out_keys=["action"]
+        ).to(device)
+        policy = policy_factory()
+        policy_weights = TensorDict.from_module(policy)
+
+        collector = MultiSyncDataCollector(
+            create_env_fn=[env_maker, env_maker],
+            policy_factory=policy_factory,
+            total_frames=2000,
+            max_frames_per_traj=50,
+            frames_per_batch=200,
+            init_random_frames=-1,
+            reset_at_each_iter=False,
+            device=device,
+            storing_device="cpu",
+            remote_weights_updater=self.MPSRemoteWeightUpdater(policy_weights, 2),
+        )
+
+        collector.update_policy_weights_()
+        try:
+            for i, data in enumerate(collector):
+                if i == 2:
+                    assert (data["action"] != 0).any()
+                    # zero the policy
+                    policy_weights.data.zero_()
+                    collector.update_policy_weights_()
+                elif i == 3:
+                    assert (data["action"] == 0).all(), data["action"]
+                    break
+        finally:
+            collector.shutdown()
+
+
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
     pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)

torchrl/collectors/collectors.py

+15 -2
@@ -837,6 +837,9 @@ def __init__(
         )

         self.local_weights_updater = local_weights_updater
+        if remote_weights_updater is not None:
+            remote_weights_updater.register_collector(self)
+
         self.remote_weights_updater = remote_weights_updater

     @property
@@ -1827,10 +1830,13 @@ def __init__(
                 "remote_weights_updater cannot be None when policy_factory is provided."
             )

+        if remote_weights_updater is not None:
+            remote_weights_updater.register_collector(self)
         self.remote_weights_updater = remote_weights_updater
         self.local_weights_updater = local_weights_updater

         self.policy = policy
+        self.policy_factory = policy_factory

         remainder = 0
         if total_frames is None or total_frames < 0:
@@ -2012,6 +2018,10 @@ def _run_processes(self) -> None:
                 env_fun = CloudpickleWrapper(env_fun)

             # Create a policy on the right device
+            policy_factory = self.policy_factory
+            if policy_factory is not None:
+                policy_factory = CloudpickleWrapper(policy_factory)
+
             policy_device = self.policy_device[i]
             storing_device = self.storing_device[i]
             env_device = self.env_device[i]
@@ -2020,13 +2030,14 @@ def _run_processes(self) -> None:
             # This makes sure that a given set of shared weights for a given device are
             # shared for all policies that rely on that device.
             policy = self.policy
-            policy_weights = self._policy_weights_dict[policy_device]
+            policy_weights = self._policy_weights_dict.get(policy_device)
             if policy is not None and policy_weights is not None:
                 cm = policy_weights.to_module(policy)
             else:
                 cm = contextlib.nullcontext()
             with cm:
                 kwargs = {
+                    "policy_factory": policy_factory,
                     "pipe_parent": pipe_parent,
                     "pipe_child": pipe_child,
                     "queue_out": queue_out,
@@ -3107,6 +3118,7 @@ def _main_async_collector(
     compile_policy: bool = False,
     cudagraph_policy: bool = False,
     no_cuda_sync: bool = False,
+    policy_factory: Callable | None = None,
 ) -> None:
     pipe_parent.close()
     # init variables that will be cleared when closing
@@ -3116,6 +3128,7 @@ def _main_async_collector(
         create_env_fn,
         create_env_kwargs=create_env_kwargs,
         policy=policy,
+        policy_factory=policy_factory,
         total_frames=-1,
         max_frames_per_traj=max_frames_per_traj,
         frames_per_batch=frames_per_batch,
@@ -3278,7 +3291,7 @@ def cast_tensor(x, MPS_ERROR=MPS_ERROR):
             continue

         elif msg == "update":
-            inner_collector.update_policy_weights_()
+            inner_collector.update_policy_weights_(policy_weights=data_in)
             pipe_child.send((j, "updated"))
             has_timed_out = False
             continue
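The _run_processes change above wraps the policy factory in CloudpickleWrapper before handing it to the worker processes. A likely reason (a hedged aside, not stated in the diff) is that factories are often lambdas or locally defined closures, which the standard pickle module used by multiprocessing cannot serialize, while cloudpickle can:

import pickle

import cloudpickle

# A lambda factory cannot be serialized by the standard-library pickler ...
policy_factory = lambda: "policy built on the worker"

try:
    pickle.dumps(policy_factory)
except (pickle.PicklingError, AttributeError) as err:
    print(f"plain pickle fails: {err}")

# ... but cloudpickle handles it, which is what wrapping the factory in
# CloudpickleWrapper relies on before sending it to the worker process.
payload = cloudpickle.dumps(policy_factory)
rebuilt = cloudpickle.loads(payload)
print(rebuilt())  # -> "policy built on the worker"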

torchrl/collectors/weight_update.py

+16 -1
@@ -6,7 +6,7 @@

 import abc
 from abc import abstractmethod
-from typing import Callable, TypeVar
+from typing import Any, Callable, TypeVar

 import torch
 from tensordict import TensorDictBase
@@ -110,6 +110,21 @@ class RemoteWeightUpdaterBase(metaclass=abc.ABCMeta):

     """

+    collector: Any = None
+
+    def register_collector(self, collector: DataCollectorBase):  # noqa
+        """Register a collector in the updater.
+
+        Once registered, the updater will not accept another collector.
+
+        Args:
+            collector (DataCollectorBase): The collector to register.
+
+        """
+        if self.collector is not None:
+            raise RuntimeError("Cannot register collector twice.")
+        self.collector = collector
+
     @abstractmethod
     def _sync_weights_with_worker(
         self, worker_id: int | torch.device, server_weights: TensorDictBase

torchrl/data/utils.py

+9 -1
@@ -222,7 +222,15 @@ def contains_lazy_spec(spec: TensorSpec) -> bool:
     return False


-class CloudpickleWrapper:
+class _CloudpickleWrapperMeta(type):
+    def __call__(cls, obj):
+        if isinstance(obj, cls):
+            return obj
+        else:
+            return super().__call__(obj)
+
+
+class CloudpickleWrapper(metaclass=_CloudpickleWrapperMeta):
     """A wrapper for functions that allow for serialization in multiprocessed settings."""

     def __init__(self, fn: Callable, **kwargs):
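The metaclass above makes CloudpickleWrapper construction idempotent: wrapping an already-wrapped callable returns the same object. A small sketch of the effect (assuming CloudpickleWrapper is imported from torchrl.data.utils, where this diff defines it):

from torchrl.data.utils import CloudpickleWrapper


def make_env():
    return "env"


wrapped = CloudpickleWrapper(make_env)

# Re-wrapping is now a no-op, so call sites that may receive either a raw
# callable or an already-wrapped one (like the policy_factory plumbing in
# collectors.py above) can wrap unconditionally.
assert CloudpickleWrapper(wrapped) is wrapped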
