
Commit 1d341eb

Author: Allen Wang

Merge branch 'main' into interfaces

2 parents: 4d11e4e + bb57589

File tree: 17 files changed, +464 −417 lines

README.md

Lines changed: 5 additions & 22 deletions
````diff
@@ -30,40 +30,23 @@ You can also find our notebook tutorials (coming soon)
 
 ## Installation
 
-### Basic
-
 torchforge requires PyTorch 2.9.0 with [Monarch](https://github.com/meta-pytorch/monarch), [vLLM](https://github.com/vllm-project/vllm), and [torchtitan](https://github.com/pytorch/torchtitan).
 
-You can install Forge with:
-```
-$ conda create -n forge python=3.10
-$ conda activate forge
-$ uv pip install .
-```
-
-(conda-less uv install is a wip)
-
-For your reference, we also include a basic install script that installs other system dependencies
-along with torchforge:
-(note that this basic install script
-uses [DNF](https://docs.fedoraproject.org/en-US/quick-docs/dnf/), but could be easily extended to other Linux OS.)
+Install torchforge with:
 
 ```bash
 conda create -n forge python=3.12
 conda activate forge
 ./scripts/install.sh
 ```
 
-Optional: By default, the packages installation uses conda. If user wants to install system packages on the target machine instead of conda, they can pass the `--use-sudo` to the installation script: `./script/install.sh --use-sudo`.
+The install script installs system dependencies along with torchforge. Note that this install script uses [DNF](https://docs.fedoraproject.org/en-US/quick-docs/dnf/), but could be easily extended to other Linux OS.
 
-After install, you can run the following command and should see output confirming GRPO training is running (you need a minimum 3 GPU devices):
+Optional: By default, the packages installation uses conda. If you want to install system packages on the target machine instead of conda, you can pass the `--use-sudo` flag to the installation script: `./scripts/install.sh --use-sudo`.
 
+> **Note:** We are actively working on enabling pure `uv` installation. Currently, Conda is the recommended approach. `uv` support is not fully working at the moment but is being tracked in [issue #494](https://github.com/meta-pytorch/torchforge/issues/494).
 
-```
-uv run apps/grpo/main.py --config apps/grpo/qwen3_1_7b.yaml
-```
-
-or if not using uv:
+After install, you can run the following command and should see output confirming GRPO training is running (you need a minimum 3 GPU devices):
 
 ```
 python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml
````
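Before launching the GRPO example, a quick sanity check for the GPU requirement mentioned in the new README text (a minimal sketch; assumes PyTorch is installed per the steps above):

```python
# Sketch: verify the machine meets the "minimum 3 GPU devices" requirement.
import torch

ngpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
print(f"Visible CUDA devices: {ngpus}")
assert ngpus >= 3, "The GRPO example requires at least 3 GPU devices"
```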

apps/grpo/main.py

Lines changed: 2 additions & 2 deletions
````diff
@@ -23,7 +23,7 @@
 from forge.actors.generator import Generator
 from forge.actors.reference_model import ReferenceModel
 from forge.actors.replay_buffer import ReplayBuffer
-from forge.actors.trainer import RLTrainer
+from forge.actors.trainer import TitanTrainer
 from forge.controller.actor import ForgeActor
 from forge.controller.provisioner import init_provisioner, shutdown
 from forge.data.rewards import MathReward, ThinkingReward
@@ -318,7 +318,7 @@ async def main(cfg: DictConfig):
     ) = await asyncio.gather(
         DatasetActor.options(**cfg.actors.dataset).as_actor(**cfg.dataset),
         Policy.options(**cfg.services.policy).as_service(**cfg.policy),
-        RLTrainer.options(**cfg.actors.trainer).as_actor(
+        TitanTrainer.options(**cfg.actors.trainer).as_actor(
             **cfg.trainer, loss=simple_grpo_loss
         ),
         ReplayBuffer.options(**cfg.actors.replay_buffer).as_actor(
````

docs/source/api_trainer.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -7,17 +7,17 @@
 The Trainer manages model training in TorchForge, built on top of TorchTitan.
 It handles forward/backward passes, weight updates, and checkpoint management for reinforcement learning workflows.
 
-## RLTrainer
+## TitanTrainer
 
 ```{eval-rst}
-.. autoclass:: RLTrainer
+.. autoclass:: TitanTrainer
    :members: train_step, push_weights, cleanup
    :exclude-members: __init__
 ```
 
 ## Configuration
 
-The RLTrainer uses TorchTitan's configuration system with the following components:
+The TitanTrainer uses TorchTitan's configuration system with the following components:
 
 ### Job Configuration
````
docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.md

Lines changed: 5 additions & 5 deletions
````diff
@@ -96,7 +96,7 @@ graph LR
     S3["RewardActor"]
     S4["ReferenceModel"]
     S5["ReplayBuffer"]
-    S6["RLTrainer"]
+    S6["TitanTrainer"]
     end
 
     C1 --> S1
@@ -306,7 +306,7 @@ TorchForge handles behind the scenes:
 from forge.actors.generator import Generator as Policy
 from forge.actors.replay_buffer import ReplayBuffer
 from forge.actors.reference_model import ReferenceModel
-from forge.actors.trainer import RLTrainer
+from forge.actors.trainer import TitanTrainer
 from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages
 from forge.data.rewards import MathReward, ThinkingReward
 import asyncio
@@ -348,7 +348,7 @@ group_size = 1
         }
     ),
     # Trainer actor with GPU
-    RLTrainer.options(procs=1, with_gpus=True).as_actor(
+    TitanTrainer.options(procs=1, with_gpus=True).as_actor(
         # Trainer config would come from YAML in real usage
         model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"},
         optimizer={"name": "AdamW", "lr": 1e-5},
@@ -378,12 +378,12 @@ group_size = 1
 
 TorchForge has two types of distributed components:
 - **Services**: Multiple replicas with automatic load balancing (like Policy, RewardActor)
-- **Actors**: Single instances that handle their own internal distribution (like RLTrainer, ReplayBuffer)
+- **Actors**: Single instances that handle their own internal distribution (like TitanTrainer, ReplayBuffer)
 
 We cover this distinction in detail in Part 2, but for now this explains the scaling patterns:
 - Policy service: num_replicas=8 for high inference demand
 - RewardActor service: num_replicas=16 for parallel evaluation
-- RLTrainer actor: Single instance with internal distributed training
+- TitanTrainer actor: Single instance with internal distributed training
 
 
 ### Fault Tolerance
````
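To make the services-versus-actors split concrete, here is a minimal spawn sketch. The `.options(...).as_service()` / `.as_actor()` calls and config dicts mirror ones shown elsewhere in this diff; the replica counts and the `spawn_all` wrapper are illustrative, not part of the commit:

```python
import asyncio

from forge.actors.generator import Generator as Policy
from forge.actors.trainer import TitanTrainer


async def spawn_all():
    # Service: replicated and load-balanced across num_replicas copies.
    policy = await Policy.options(num_replicas=8, with_gpus=True).as_service(
        engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1},
    )
    # Actor: a single instance that manages its own internal distribution.
    trainer = await TitanTrainer.options(procs=1, with_gpus=True).as_actor(
        model={"name": "qwen3", "flavor": "1.7B"},
        optimizer={"name": "AdamW", "lr": 1e-5},
    )
    return policy, trainer


# asyncio.run(spawn_all())  # requires a working TorchForge deployment
```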

docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.md

Lines changed: 5 additions & 5 deletions
````diff
@@ -470,7 +470,7 @@ async def simple_rl_step():
     if batch is not None:
         print("Training on batch...")
         inputs, targets = batch  # GRPO returns (inputs, targets) tuple
-        loss = await trainer.train_step.call(inputs, targets)  # RLTrainer is an actor
+        loss = await trainer.train_step.call(inputs, targets)  # TitanTrainer is an actor
         print(f"Training loss: {loss}")
         return loss
     else:
@@ -507,7 +507,7 @@ reward_actor = await RewardActor.options(
 )
 
 # Training needs fewer but more powerful replicas
-trainer = await RLTrainer.options(
+trainer = await TitanTrainer.options(
     procs=1, with_gpus=True  # Fewer but GPU-heavy
 ).as_actor(  # Trainer typically uses .as_actor() not .as_service()
     model={"name": "qwen3", "flavor": "1.7B"},
@@ -580,7 +580,7 @@ import torch
 from forge.actors.generator import Generator as Policy
 from forge.actors.reference_model import ReferenceModel
 from forge.actors.replay_buffer import ReplayBuffer
-from forge.actors.trainer import RLTrainer
+from forge.actors.trainer import TitanTrainer
 from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages
 from forge.data.rewards import MathReward, ThinkingReward
 
@@ -603,7 +603,7 @@ print("Initializing all services...")
         engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1},
         sampling_config={"n": 1, "max_tokens": 512}
     ),
-    RLTrainer.options(procs=1, with_gpus=True).as_actor(
+    TitanTrainer.options(procs=1, with_gpus=True).as_actor(
         model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"},
         optimizer={"name": "AdamW", "lr": 1e-5},
         training={"local_batch_size": 2, "seq_len": 2048}
@@ -667,7 +667,7 @@ print("Shutting down services...")
 await asyncio.gather(
     DatasetActor.shutdown(dataloader),
     policy.shutdown(),
-    RLTrainer.shutdown(trainer),
+    TitanTrainer.shutdown(trainer),
     ReplayBuffer.shutdown(replay_buffer),
     ComputeAdvantages.shutdown(compute_advantages),
     ReferenceModel.shutdown(ref_model),
````

src/forge/actors/__init__.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import warnings
8+
79
__all__ = [
810
"Generator",
9-
"RLTrainer",
11+
"TitanTrainer",
12+
"RLTrainer", # Deprecated, use TitanTrainer
1013
"ReplayBuffer",
1114
"ReferenceModel",
1215
"SandboxedPythonCoder",
@@ -18,7 +21,17 @@ def __getattr__(name):
1821
from .generator import Generator
1922

2023
return Generator
24+
elif name == "TitanTrainer":
25+
from .trainer import TitanTrainer
26+
27+
return TitanTrainer
2128
elif name == "RLTrainer":
29+
warnings.warn(
30+
"RLTrainer is deprecated and will be removed in a future version. "
31+
"Please use TitanTrainer instead.",
32+
FutureWarning,
33+
stacklevel=2,
34+
)
2235
from .trainer import RLTrainer
2336

2437
return RLTrainer
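A small usage sketch of the shim above (assumes a working torchforge install): importing the old name still resolves via the module-level `__getattr__` (PEP 562), but surfaces a `FutureWarning`:

```python
# Sketch: the deprecated alias still resolves, but emits a FutureWarning.
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from forge.actors import RLTrainer  # routed through forge.actors.__getattr__

print([str(w.message) for w in caught if issubclass(w.category, FutureWarning)])
```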
src/forge/actors/trainer/__init__.py (new file)

Lines changed: 23 additions & 0 deletions

````diff
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+
+from .titan import TitanTrainer
+
+__all__ = ["TitanTrainer", "RLTrainer"]
+
+
+def __getattr__(name):
+    if name == "RLTrainer":
+        warnings.warn(
+            "RLTrainer is deprecated and will be removed in a future version. "
+            "Please use TitanTrainer instead.",
+            FutureWarning,
+            stacklevel=2,
+        )
+        return TitanTrainer
+    raise AttributeError(f"module {__name__} has no attribute {name}")
````
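Because this subpackage's `__getattr__` returns `TitanTrainer` itself, the deprecated name is a pure alias for the new class (a sketch, assuming a working install):

```python
from forge.actors.trainer import TitanTrainer
from forge.actors.trainer import RLTrainer  # emits FutureWarning

assert RLTrainer is TitanTrainer  # the shim hands back the same class object
```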

src/forge/actors/trainer.py renamed to src/forge/actors/trainer/titan.py

Lines changed: 2 additions & 2 deletions
````diff
@@ -53,8 +53,8 @@
 
 
 @dataclass
-class RLTrainer(ForgeActor):
-    """A reinforcement learning trainer actor for policy optimization training.
+class TitanTrainer(ForgeActor):
+    """A generic trainer actor implementation built on top of TorchTitan.
 
     Built on top of TorchTitan's training engine, this actor provides a complete training
     loop for reinforcement learning. It performs forward and backward passes with gradient
````

src/forge/controller/provisioner.py

Lines changed: 64 additions & 9 deletions
````diff
@@ -12,6 +12,8 @@
 import socket
 import uuid
 
+import torch
+
 from monarch._src.actor.actor_mesh import ActorMesh
 from monarch._src.actor.shape import Extent
 
@@ -41,8 +43,19 @@ class _RemoteInfoFetcher(Actor):
 
     @endpoint
     def get_info(self) -> tuple[str, str]:
+        """Returns hostname and port."""
         return socket.gethostname(), _get_port()
 
+    @endpoint
+    def get_gpu_count(self) -> int:
+        """Returns the number of GPUs available on this host."""
+        try:
+            gpu_count = torch.cuda.device_count()
+        except Exception:
+            # If torch is not available or CUDA is not available, assume no GPUs
+            gpu_count = 0
+        return gpu_count
+
 
 class EnvSetter(Actor):
     """Actor to set environment variables on each proc in a mesh.
@@ -87,14 +100,26 @@ async def get_remote_info(host_mesh: HostMesh) -> tuple[str, str]:
     singleton_slice = {k: slice(0, 1) for k in fetcher.extent.keys()}
     fetcher = fetcher.slice(**singleton_slice)
     # Fetcher should be a singleton at this point - call_one() will fail otherwise
-
     host, port = await fetcher.get_info.call_one()
 
     # Stopping this proc is the right thing to do, but Monarch does not yet handle manual stops well.
     # await throwaway_procs.stop()
     return host, port
 
 
+async def get_host_gpus(host_mesh: HostMesh) -> int:
+    """Returns the number of GPUs available on the host mesh."""
+    throwaway_procs = host_mesh.spawn_procs(per_host={"procs": 1})
+    fetcher = throwaway_procs.spawn("_gpu_counter", _RemoteInfoFetcher)
+
+    # Reduce to a singleton
+    singleton_slice = {k: slice(0, 1) for k in fetcher.extent.keys()}
+    fetcher = fetcher.slice(**singleton_slice)
+
+    gpu_count = await fetcher.get_gpu_count.call_one()
+    return gpu_count
+
+
 async def set_environment(proc_mesh: ProcMesh, env_vars: dict[str, str]):
     """Set environment variables on a proc mesh using EnvSetter actor.
 
@@ -112,17 +137,35 @@ async def set_environment(proc_mesh: ProcMesh, env_vars: dict[str, str]):
 class GpuManager:
     """Tracks and assigns GPU devices on a host.
 
-    This currently mimics the `gpu_manager` in system_controllers - we will
-    consolidate as part of the "proper HostMesh integration" work.
+    Args:
+        available_devices: Set of GPU device IDs to manage. If None, uses all devices from 0 to max_device_count-1.
+        max_device_count: Maximum number of GPU devices on this host. Defaults to 8.
     """
 
-    def __init__(self, available_devices: set[int] | None = None):
+    def __init__(
+        self, available_devices: set[int] | None = None, max_device_count: int = 8
+    ):
         if available_devices is None:
-            available_devices = set(range(0, 8))
-        assert all(isinstance(x, int) for x in available_devices)
-        assert all(x >= 0 and x < 8 for x in available_devices)
+            available_devices = set(range(0, max_device_count))
+        else:
+            # Validate types first
+            assert all(
+                isinstance(x, int) for x in available_devices
+            ), f"All device IDs must be integers, got: {available_devices}"
+            # When available_devices is provided (e.g., from CUDA_VISIBLE_DEVICES),
+            # adjust max_device_count to accommodate the highest device ID
+            if available_devices:
+                max_device_count = max(max(available_devices) + 1, max_device_count)
+
+        assert all(
+            isinstance(x, int) for x in available_devices
+        ), f"All device IDs must be integers, got: {available_devices}"
+        assert all(
+            x >= 0 for x in available_devices
+        ), f"All device IDs must be non-negative, got: {available_devices}"
         self.available_gpus = available_devices
+        self.max_device_count = max_device_count
 
     def get_available_gpus(self) -> list[str]:
         """Returns a list of available GPU devices."""
@@ -171,8 +214,18 @@ def __init__(self, cfg: ProvisionerConfig | None = None):
                     f"Invalid CUDA_VISIBLE_DEVICES format: '{cuda_visible_devices}'. "
                     f"Expected comma-separated integers (e.g., '0,1,2'). Error: {e}"
                 ) from e
+
+        # Get the actual GPU count for the local host
+        try:
+            local_gpu_count = torch.cuda.device_count()
+        except Exception:
+            # If torch is not available or CUDA is not available, assume no GPUs
+            local_gpu_count = 0
+
         self._host_gpu_map = {
-            self._this_host_id: GpuManager(available_local_devices),
+            self._this_host_id: GpuManager(
+                available_local_devices, max_device_count=local_gpu_count
+            ),
         }
         self._proc_host_map = {}
         self._host_mesh_map = {}
@@ -277,7 +330,9 @@ async def get_proc_mesh(
                 num_hosts=num_hosts,
             )
             host_id = uuid.uuid1()
-            gpu_manager = GpuManager()
+            # Get the GPU count from the remote host
+            remote_gpu_count = await get_host_gpus(host_mesh)
+            gpu_manager = GpuManager(max_device_count=remote_gpu_count)
             self._host_gpu_map[host_id] = gpu_manager
             host_mesh._host_id = host_id
         else:
````
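A sketch of the new `GpuManager` semantics introduced here (illustrative values; assumes the class is importable as shown in the diff):

```python
# Sketch: GpuManager now sizes itself from max_device_count instead of a
# hard-coded 8, and grows to cover explicitly listed device IDs.
from forge.controller.provisioner import GpuManager

# Default: manage device IDs 0..max_device_count-1 (max_device_count defaults to 8).
gm = GpuManager(max_device_count=4)
assert sorted(gm.available_gpus) == [0, 1, 2, 3]

# Explicit devices, e.g. parsed from CUDA_VISIBLE_DEVICES="2,9":
# max_device_count expands to max(9 + 1, 8) = 10.
gm = GpuManager(available_devices={2, 9})
assert gm.max_device_count == 10
```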

src/forge/controller/system_controllers/__init__.py

Lines changed: 0 additions & 12 deletions
This file was deleted.
