Update

Vincent Moens · Vincent Moens · commit 2d86afc8ebc9 · 2025-03-10T17:31:35.000Z
[ghstack-poisoned]
diff --git a/test/test_env.py b/test/test_env.py
@@ -1692,6 +1692,34 @@ def test_parallel_env_device(
             env_serial.close(raise_if_closed=False)
             env0.close(raise_if_closed=False)
 
+    @pytest.mark.skipif(not _has_gym, reason="no gym")
+    @pytest.mark.parametrize("env_device", [None, "cpu"])
+    def test_parallel_env_device_vs_no_device(self, maybe_fork_ParallelEnv, env_device):
+        def make_env() -> GymEnv:
+            env = GymEnv(PENDULUM_VERSIONED(), device=env_device)
+            return env.append_transform(DoubleToFloat())
+
+        # Reference rollout: ParallelEnv built with device=None
+        parallel_env = maybe_fork_ParallelEnv(
+            num_workers=1, create_env_fn=make_env, device=None
+        )
+        parallel_env.reset()
+        parallel_env.set_seed(0)
+        torch.manual_seed(0)
+
+        parallel_rollout = parallel_env.rollout(max_steps=10)
+
+        # Same env factory and seeds, but with the ParallelEnv explicitly on "cpu"
+        parallel_env = maybe_fork_ParallelEnv(
+            num_workers=1, create_env_fn=make_env, device="cpu"
+        )
+        parallel_env.reset()
+        parallel_env.set_seed(0)
+        torch.manual_seed(0)
+
+        parallel_rollout_cpu = parallel_env.rollout(max_steps=10)
+        assert_allclose_td(parallel_rollout, parallel_rollout_cpu)  # both rollouts must match
+
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.flaky(reruns=3, reruns_delay=1)
     @pytest.mark.parametrize(
@@ -4907,7 +4935,6 @@ def policy(td):
             if assign_done:
                 assert "terminated" in r
                 assert "done" in r
-            print(r)
 
 
 if __name__ == "__main__":
diff --git a/test/test_storage_map.py b/test/test_storage_map.py
@@ -350,6 +350,17 @@ def test_edges(self):
         edges_check = {(0, 1), (0, 2), (1, 3), (1, 4), (2, 5), (2, 6)}
         assert edges == edges_check
 
+    def test_make_node(self):
+        td = TensorDict({"obs": torch.tensor([0])})
+        tree = Tree(node_data=td)  # direct construction through the node_data keyword
+        assert tree.node_data is not None
+
+        tree = Tree.make_node(data=td)  # make_node with the data passed by keyword
+        assert tree.node_data is not None
+
+        tree = Tree.make_node(td)  # make_node with the data passed positionally
+        assert tree.node_data is not None
+
 
 class TestMCTSForest:
     def dummy_rollouts(self) -> Tuple[TensorDict, ...]:
diff --git a/torchrl/_utils.py b/torchrl/_utils.py
@@ -18,7 +18,6 @@
 import warnings
 from contextlib import nullcontext
 from copy import copy
-from distutils.util import strtobool
 from functools import wraps
 from importlib import import_module
 from typing import Any, Callable, cast, TypeVar
@@ -35,6 +34,21 @@
 except ImportError:
     from torch._dynamo import is_compiling
 
+
+def strtobool(val: Any) -> bool:
+    """Convert a string representation of truth to a boolean (case-insensitive).
+
+    True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values are 'n', 'no', 'f', 'false', 'off', and '0'.
+    Raises ValueError if 'val' is any other string; 'val' must provide ``.lower()``, which is called before matching.
+    """
+    val = val.lower()  # normalise case so that e.g. "TRUE" and "Yes" are accepted
+    if val in ("y", "yes", "t", "true", "on", "1"):
+        return True
+    if val in ("n", "no", "f", "false", "off", "0"):
+        return False
+    raise ValueError(f"Invalid truth value {val!r}")  # reports the lower-cased value
+
+
 LOGGING_LEVEL = os.environ.get("RL_LOGGING_LEVEL", "INFO")
 logger = logging.getLogger("torchrl")
 logger.setLevel(getattr(logging, LOGGING_LEVEL))
diff --git a/torchrl/data/llm/__init__.py b/torchrl/data/llm/__init__.py
@@ -11,7 +11,14 @@
 )
 from .prompt import PromptData, PromptTensorDictTokenizer
 from .reward import PairwiseDataset, RewardData
-from .utils import AdaptiveKLController, ConstantKLController, RolloutFromModel, LLMData, LLMOutput, LLMInput
+from .utils import (
+    AdaptiveKLController,
+    ConstantKLController,
+    LLMData,
+    LLMInput,
+    LLMOutput,
+    RolloutFromModel,
+)
 
 __all__ = [
     "AdaptiveKLController",
diff --git a/torchrl/data/llm/utils.py b/torchrl/data/llm/utils.py
@@ -543,8 +543,10 @@ def step_scheduler(self):
             while len(self._kl_queue):
                 self._kl_queue.remove(self._kl_queue[0])
 
+
 LLMInpOut = TypeVar("LLMInpOut")
 
+
 class LLMInput(TensorClass["nocast"]):
     """Represents the input to a Large Language Model (LLM).
 
@@ -557,11 +559,13 @@ class LLMInput(TensorClass["nocast"]):
     .. seealso:: :class:`~torchrl.data.LLMOutput` and :class:`~torchrl.data.LLMData`.
 
     """
+
     tokens: torch.Tensor
     attention_mask: torch.Tensor | None = None
     token_list: list[int] | list[list[int]] | None = None
     text: str | list[str] | None = None
 
+
 class LLMOutput(TensorClass["nocast"]):
     """Represents the output from a Large Language Model (LLM).
 
@@ -581,6 +585,7 @@ class LLMOutput(TensorClass["nocast"]):
     .. seealso:: :class:`~torchrl.data.LLMInput` and :class:`~torchrl.data.LLMData`.
 
     """
+
     tokens: torch.Tensor
     tokens_response: torch.Tensor | None = None
     token_list: list[int] | list[list[int]] | None = None
@@ -594,6 +599,7 @@ def from_vllm_output(cls: type[LLMInpOut], vllm_output) -> LLMInpOut:
         # placeholder
         raise NotImplementedError
 
+
 class LLMData(TensorClass["nocast"]):
     """Represents the input or output of a Large Language Model (LLM).
 
@@ -619,6 +625,7 @@ class LLMData(TensorClass["nocast"]):
     .. seealso:: :class:`~torchrl.data.LLMInput` and :class:`~torchrl.data.LLMOutput`.
 
     """
+
     tokens: torch.Tensor
     tokens_response: torch.Tensor | None = None
     attention_mask: torch.Tensor | None = None
diff --git a/torchrl/data/map/tree.py b/torchrl/data/map/tree.py
@@ -122,7 +122,7 @@ def make_node(
         return cls(
             count=torch.zeros(()),
             wins=torch.zeros(()),
-            node=data.exclude("action", "next"),
+            node_data=data.exclude("action", "next"),
             rollout=rollout,
             subtree=subtree,
             device=device,
diff --git a/torchrl/data/postprocs/postprocs.py b/torchrl/data/postprocs/postprocs.py
@@ -12,7 +12,6 @@
 from torch import nn
 
 
-
 def _get_reward(
     gamma: float,
     reward: torch.Tensor,
@@ -367,6 +366,7 @@ def __init__(
         discount: float = 1.0,
     ):
         from torchrl.objectives.value.functional import reward2go
+
         super().__init__()
         self.in_keys = [unravel_key(reward_key), unravel_key(done_key)]
         if reward_key_out is None:
diff --git a/torchrl/envs/batched_envs.py b/torchrl/envs/batched_envs.py
@@ -379,6 +379,14 @@ def __init__(
 
     is_spec_locked = EnvBase.is_spec_locked
 
+    def select_and_clone(self, name, tensor, selected_keys=None):
+        if selected_keys is None:  # no explicit key set: fall back to self._selected_step_keys
+            selected_keys = self._selected_step_keys
+        if name in selected_keys:  # names outside selected_keys fall through and return None
+            if self.device is not None and tensor.device != self.device:
+                return tensor.to(self.device, non_blocking=self.non_blocking)  # move to the env device
+            return tensor.clone()  # no env device set, or tensor already on it: return a clone
+
     @property
     def non_blocking(self):
         nb = self._non_blocking
@@ -1072,12 +1080,10 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
         selected_output_keys = self._selected_reset_keys_filt
 
         # select + clone creates 2 tds, but we can create one only
-        def select_and_clone(name, tensor):
-            if name in selected_output_keys:
-                return tensor.clone()
-
         out = self.shared_tensordict_parent.named_apply(
-            select_and_clone,
+            lambda *args: self.select_and_clone(
+                *args, selected_keys=selected_output_keys
+            ),
             nested_keys=True,
             filter_empty=True,
         )
@@ -1150,14 +1156,14 @@ def _step(
             # will be modified in-place at further steps
             device = self.device
 
-            def select_and_clone(name, tensor):
-                if name in self._selected_step_keys:
-                    return tensor.clone()
+            selected_keys = self._selected_step_keys
 
             if partial_steps is not None:
                 next_td = TensorDict.lazy_stack([next_td[i] for i in workers_range])
             out = next_td.named_apply(
-                select_and_clone, nested_keys=True, filter_empty=True
+                lambda *args: self.select_and_clone(*args, selected_keys),
+                nested_keys=True,
+                filter_empty=True,
             )
             if out_tds is not None:
                 out.update(
@@ -2010,20 +2016,8 @@ def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
         next_td = shared_tensordict_parent.get("next")
         device = self.device
 
-        if next_td.device != device and device is not None:
-
-            def select_and_clone(name, tensor):
-                if name in self._selected_step_keys:
-                    return tensor.to(device, non_blocking=self.non_blocking)
-
-        else:
-
-            def select_and_clone(name, tensor):
-                if name in self._selected_step_keys:
-                    return tensor.clone()
-
         out = next_td.named_apply(
-            select_and_clone,
+            self.select_and_clone,
             nested_keys=True,
             filter_empty=True,
             device=device,
@@ -2203,20 +2197,10 @@ def tentative_update(val, other):
         selected_output_keys = self._selected_reset_keys_filt
         device = self.device
 
-        if self.shared_tensordict_parent.device != device and device is not None:
-
-            def select_and_clone(name, tensor):
-                if name in selected_output_keys:
-                    return tensor.to(device, non_blocking=self.non_blocking)
-
-        else:
-
-            def select_and_clone(name, tensor):
-                if name in selected_output_keys:
-                    return tensor.clone()
-
         out = self.shared_tensordict_parent.named_apply(
-            select_and_clone,
+            lambda *args: self.select_and_clone(
+                *args, selected_keys=selected_output_keys
+            ),
             nested_keys=True,
             filter_empty=True,
             device=device,
diff --git a/torchrl/envs/custom/llm.py b/torchrl/envs/custom/llm.py
@@ -80,41 +80,37 @@ def __init__(
             self._batch_locked = False
         else:
             self._batch_locked = True
-        super().__init__(device=device, batch_size=() if batch_size is None else (batch_size,))
+        super().__init__(
+            device=device, batch_size=() if batch_size is None else (batch_size,)
+        )
         self.str2str = str2str
         self.vocab_size = vocab_size
         self.observation_key = unravel_key(token_key)
-        self.attention_key = unravel_key(attention_key)
+        if attention_key is not None:
+            attention_key = unravel_key(attention_key)
+        self.attention_key = attention_key
         self.no_stack = no_stack
         self.assign_reward = assign_reward
         self.assign_done = assign_done
 
         # self.action_key = unravel_key(action_key)
         if str2str:
             self.full_observation_spec_unbatched = Composite(
-                {
-                    token_key: NonTensor(
-                        example_data="a string", batched=True, shape=()
-                    )
-                }
+                {token_key: NonTensor(example_data="a string", batched=True, shape=())}
             )
             self.full_action_spec_unbatched = Composite(
                 {action_key: NonTensor(example_data="a string", batched=True, shape=())}
             )
         else:
             if vocab_size is None:
                 observation_spec = {
-                        token_key: Unbounded(
-                            shape=(-1,), dtype=torch.int64, device=device
-                        )
-                    }
+                    token_key: Unbounded(shape=(-1,), dtype=torch.int64, device=device)
+                }
                 if attention_key is not None:
                     observation_spec[attention_key] = Unbounded(
-                            shape=(-1,), dtype=torch.int64, device=device
-                        )
-                self.full_observation_spec_unbatched = Composite(
-                    observation_spec
-                )
+                        shape=(-1,), dtype=torch.int64, device=device
+                    )
+                self.full_observation_spec_unbatched = Composite(observation_spec)
                 self.full_action_spec_unbatched = Composite(
                     {
                         action_key: Unbounded(
@@ -392,7 +388,6 @@ def _make_next_obs(
 
     def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
         # We should have an observation by this time, if not raise an exception
-        print('tensordict', tensordict)
         if tensordict is None or self.observation_key not in tensordict.keys(
             isinstance(self.observation_key, tuple)
         ):
diff --git a/torchrl/envs/libs/unity_mlagents.py b/torchrl/envs/libs/unity_mlagents.py
@@ -132,7 +132,6 @@ def _collect_agents(self, env):
         for steps_idx in [0, 1]:
             for behavior in env.behavior_specs.keys():
                 steps = env.get_steps(behavior)[steps_idx]
-                is_terminal = steps_idx == 1
                 agent_ids = steps.agent_id
                 group_ids = steps.group_id
 
diff --git a/torchrl/envs/utils.py b/torchrl/envs/utils.py
@@ -1407,7 +1407,6 @@ def _update_during_reset(
     if not reset_keys:
         return tensordict.update(tensordict_reset)
     roots = set()
-    print("reset_keys", reset_keys)
     for reset_key in reset_keys:
         # get the node of the reset key
         if isinstance(reset_key, tuple):
@@ -1423,7 +1422,6 @@ def _update_during_reset(
             reset_key_tuple = (reset_key,)
         # get the reset signal
         reset = tensordict.pop(reset_key, None)
-        print("reset popped", reset)
 
         # check if this reset should be ignored -- this happens whenever the
         # root node has already been updated
diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py
@@ -1281,8 +1281,18 @@ def __init__(
             skip_existing=skip_existing,
             device=device,
         )
-        self.register_buffer("gamma", torch.tensor(gamma, device=self._device))
-        self.register_buffer("lmbda", torch.tensor(lmbda, device=self._device))
+        self.register_buffer(
+            "gamma",
+            gamma.to(self._device)
+            if isinstance(gamma, Tensor)
+            else torch.tensor(gamma, device=self._device),
+        )
+        self.register_buffer(
+            "lmbda",
+            lmbda.to(self._device)
+            if isinstance(lmbda, Tensor)
+            else torch.tensor(lmbda, device=self._device),
+        )
         self.average_gae = average_gae
         self.vectorized = vectorized
         self.time_dim = time_dim