@@ -48,6 +48,9 @@ class LLMEnv(EnvBase):
48
48
Prompts to the language model can be loaded when the environment is ``reset`` if the environment is created via
49
49
:meth:`~from_dataloader`.
50
50
51
+ .. note:: The default arguments of the `LLMEnv` class are set to make it easy to run this environment with
52
+ the vLLM backend (:class:`~torchrl.modules.vLLMWrapper`).
53
+
51
54
Keyword Args:
52
55
token_key (NestedKey, optional): The key in the tensordict where the tokens are stored (when `str2str=False`).
53
56
Defaults to ``"tokens"``.
@@ -59,7 +62,7 @@ class LLMEnv(EnvBase):
59
62
``"tokens_response"`` or ``"text_response"``.
60
63
reward_key (NestedKey, optional): The key in the tensordict where the reward is stored if `assign_reward=True`.
61
64
Defaults to ``"reward"``.
62
- str2str (bool, optional): Whether the environment should expect strings as input and output. Defaults to ``False ``.
65
+ str2str (bool, optional): Whether the environment should expect strings as input and output. Defaults to ``True``.
63
66
device (torch.device | None, optional): The device on which the environment should run. Defaults to ``None``.
64
67
vocab_size (int | None, optional): The size of the vocabulary. If None, the environment will assume an
65
68
unbounded vocabulary. Defaults to ``None``.
@@ -102,7 +105,7 @@ def __init__(
102
105
attention_key : NestedKey | None = None ,
103
106
action_key : NestedKey | None = None ,
104
107
reward_key : NestedKey = "reward" ,
105
- str2str : bool = False ,
108
+ str2str : bool = True ,
106
109
device : torch .device | None = None ,
107
110
vocab_size : int | None = None ,
108
111
no_stack : bool = True ,
@@ -250,7 +253,7 @@ def from_dataloader(
250
253
attention_key : NestedKey | None = None ,
251
254
action_key : NestedKey | None = None ,
252
255
reward_key : NestedKey = "reward" ,
253
- str2str : bool = False ,
256
+ str2str : bool = True ,
254
257
device : torch .device | None = None ,
255
258
vocab_size : int | None = None ,
256
259
no_stack : bool = False ,
@@ -267,7 +270,7 @@ def from_dataloader(
267
270
stack_method : Callable [[Any ], Any ]
268
271
| Literal ["as_nested_tensor" , "as_padded_tensor" ] = None ,
269
272
repeats : int | None = None ,
270
- group_repeats : bool = False ,
273
+ group_repeats : bool = True ,
271
274
) -> LLMEnv :
272
275
"""Creates an LLMEnv instance from a dataloader.
273
276
@@ -297,7 +300,7 @@ def from_dataloader(
297
300
``("tokens_out", "sequences")``.
298
301
reward_key (NestedKey, optional): The key in the tensordict where the reward is stored if `assign_reward=True`.
299
302
Defaults to ``"reward"``.
300
- str2str (bool, optional): Whether the environment should expect strings as input and output. Defaults to ``False ``.
303
+ str2str (bool, optional): Whether the environment should expect strings as input and output. Defaults to ``True``.
301
304
device (torch.device | None, optional): The device on which the environment should run. Defaults to ``None``.
302
305
vocab_size (int | None, optional): The size of the vocabulary. If None, the environment will assume an
303
306
unbounded vocabulary. Defaults to ``None``.
@@ -334,7 +337,7 @@ def from_dataloader(
334
337
situations like GRPO where a single prompt is used multiple times to estimate the advantage using Monte-Carlo
335
338
samples (rather than an advantage module).
336
339
group_repeats (bool, optional): if ``True``, the batch-size is multiplied by the number of repeats such that
337
- all repeats are grouped in a single batch collected from the buffer. Defaults to ``False ``.
340
+ all repeats are grouped in a single batch collected from the buffer. Defaults to ``True``.
338
341
339
342
Returns:
340
343
LLMEnv: The created LLMEnv instance.
0 commit comments