
Commit 5ae50d0

[llama4] add auxiliary-loss-free load balancing to MoE token routing (#1114)
There are two known issues in this solution:
1. The communication (syncing tokens per expert across all DP ranks) happens on the default stream. It may need to be moved onto the FSDP/DDP communication stream.
2. The communication blocks the expert bias update, so it is always exposed.
We need to evaluate whether issue 2 is a performance problem; issue 1 is acceptable if issue 2 is.
1 parent 20e2f06 commit 5ae50d0
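
For readers skimming the diffs below, here is a condensed, self-contained sketch of the mechanism this commit adds. It is illustrative only: the helper names (`route`, `update_expert_bias`) and the toy sizes are not from the repository; the actual implementation lives in `moe.py` (the `expert_bias`/`tokens_per_expert` buffers and the `_update_expert_bias` backward hook) and `parallelize_llama.py` (the DP all-reduce of `tokens_per_expert`).

```python
# Illustrative sketch of auxiliary-loss-free load balancing (not the repo code;
# see torchtitan/experiments/llama4/model/moe.py for the real implementation).
import torch

num_experts, top_k, coeff = 4, 2, 1e-3        # toy sizes; coeff mirrors load_balance_coeff
expert_bias = torch.zeros(num_experts)        # persistent buffer in MoE
tokens_per_expert = torch.zeros(num_experts)  # per-step routing counts

def route(scores: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Top-k selection uses biased scores; gating uses the original scores."""
    _, selected = torch.topk(scores + expert_bias, k=top_k, dim=1)
    top_scores = scores.gather(dim=1, index=selected)
    # count how many tokens each expert received this step
    tokens_per_expert.add_(
        torch.histc(selected.float(), bins=num_experts, min=0, max=num_experts)
    )
    return top_scores, selected

def update_expert_bias() -> None:
    """Once per step: nudge under-used experts up and over-used experts down.
    In the commit this runs in a full backward hook on MoE; a prepended hook
    first all-reduces tokens_per_expert across DP ranks (currently blocking,
    on the default stream -- the two issues called out above)."""
    global expert_bias
    delta = coeff * torch.sign(tokens_per_expert.mean() - tokens_per_expert)
    expert_bias = expert_bias + (delta - delta.mean())  # keep the bias zero-mean
    tokens_per_expert.zero_()

# Toy usage: routing scores skewed toward expert 0 get gradually counter-biased.
for _ in range(3):
    scores = torch.sigmoid(
        torch.randn(8, num_experts) + torch.tensor([2.0, 0.0, 0.0, 0.0])
    )
    route(scores)
    update_expert_bias()
print(expert_bias)  # expert 0 (over-used) drifts toward a negative bias
```

Note that the gating values multiplied into the routed tokens are still taken from the unbiased scores (the `scores.gather` in the router), so the bias only reshapes which experts are selected, not how much each selected expert contributes.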

File tree: 7 files changed (+106, -28 lines)
Lines changed: 6 additions & 5 deletions
@@ -1,7 +1,10 @@
 **The Llama 4 folder is still under development.**
 
+#### Issue tracking
+https://github.com/pytorch/torchtitan/issues/1118
+
 #### Available features
-- Llama 4 model definition (text-only), including the MoE architecture with token-choice routing using efficient bfloat16 Grouped MM kernels
+- Llama 4 model (text-only), including a token-choice MoE architecture with efficient bfloat16 Grouped MM kernels and auxiliary-loss-free load balancing
 - FSDP, TP, PP, CP support
 - DCP checkpoint conversion scripts
 
@@ -13,17 +16,15 @@ python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E
 
 #### To be added
 - Modeling
-  - iRoPE implementation
-  - load balance loss for token-choice MoE
   - alternative expert-choice MoE
   - multimodal support
 - Parallelism
-  - Context Parallel support for FlexAttention, iRoPE, and multimodal inputs
+  - Context Parallel support for FlexAttention and multimodal inputs
   - Expert Parallel support
 - torch.compile
   - for MoE layers
 - Quantization
-  - efficient float8 GroupedGEMM kernels (from torchao)
+  - efficient float8 Grouped MM kernels (from torchao)
 - Testing
   - perfomance and loss converging tests
   - CI integration

torchtitan/experiments/llama4/infra/parallelize_llama.py

Lines changed: 30 additions & 3 deletions
@@ -21,6 +21,8 @@
 )
 from torchtitan.tools.logging import logger
 
+from ..model.moe import MoE
+
 
 def parallelize_llama(
     model: nn.Module,
@@ -74,17 +76,19 @@ def parallelize_llama(
     # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
     torch._dynamo.config.capture_scalar_outputs = True
 
+    dp_mesh: DeviceMesh | None = None
     if (
         parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
     ):  # apply FSDP or HSDP, potentially with Context Parallel
         if parallel_dims.dp_replicate_enabled:
             dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
         else:
             dp_mesh_dim_names = ("dp_shard_cp",)
+        dp_mesh = world_mesh[tuple(dp_mesh_dim_names)]
 
         apply_fsdp(
             model,
-            world_mesh[tuple(dp_mesh_dim_names)],
+            dp_mesh,
             param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
             reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
             pp_enabled=parallel_dims.pp_enabled,
@@ -105,13 +109,36 @@ def parallelize_llama(
     elif parallel_dims.dp_replicate_enabled:
         if world_mesh.ndim > 1:
             raise RuntimeError("DDP has not supported > 1D parallelism")
+        dp_mesh = world_mesh
         apply_ddp(
             model,
-            world_mesh,
+            dp_mesh,
             enable_compile=job_config.training.compile,
             enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
         )
 
+    # for MoE auxiliary-loss-free load balancing
+    if dp_mesh is not None:
+        # NOTE: Currently this sync is blocking (thus exposed) and happens on the
+        # default compute stream. Need to assess if this is OK performance-wise.
+        def _sync_tokens_per_expert(module, *_):
+            assert isinstance(module, MoE)
+            torch.distributed.all_reduce(
+                module.tokens_per_expert, group=dp_mesh.get_group()
+            )
+
+        for transformer_block in model.layers.values():
+            if transformer_block.moe_enabled:
+                load_balance_coeff = transformer_block.moe.load_balance_coeff
+                if load_balance_coeff is not None and load_balance_coeff > 0:
+                    # prepend=True so that the sync runs before
+                    # the _update_expert_bias hook in MoE
+                    transformer_block.moe.register_full_backward_hook(
+                        _sync_tokens_per_expert, prepend=True
+                    )
+                else:
+                    break
+
     return model
 
 
@@ -127,7 +154,7 @@ def apply_moe_tp(
 
     from .expert_parallel import NoParallel, TensorParallel
 
-    for _, transformer_block in model.layers.items():
+    for transformer_block in model.layers.values():
         moe_layer_plan = {
             # input / output sharding on the seqlen dim
             # all-gather for input, reduce-scatter for output

torchtitan/experiments/llama4/model/args.py

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ class TransformerModelArgs(BaseModelArgs):
     # token-choice
     top_k: int = 1
     use_grouped_mm: bool = True  # grouped mm or for-loop for the experts computation
+    load_balance_coeff: float | None = 1e-3
 
     def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
         self.vocab_size = tokenizer.n_words

torchtitan/experiments/llama4/model/model.py

Lines changed: 3 additions & 3 deletions
@@ -341,12 +341,12 @@ def forward(
         out = h + self.feed_forward(self.ffn_norm(h))
         return out
 
-    def init_weights(self):
+    def init_weights(self, buffer_device: torch.device):
         for norm in (self.attention_norm, self.ffn_norm):
             norm.reset_parameters()
         self.attention.init_weights(self.weight_init_std)
         if self.moe_enabled:
-            self.moe.init_weights(self.weight_init_std)
+            self.moe.init_weights(self.weight_init_std, buffer_device)
         else:
             self.feed_forward.init_weights(self.weight_init_std)
 
@@ -417,7 +417,7 @@ def init_weights(
             nn.init.normal_(self.tok_embeddings.weight)
         for layer in self.layers.values():
             if layer is not None:
-                layer.init_weights()
+                layer.init_weights(buffer_device=buffer_device)
         if self.norm is not None:
             self.norm.reset_parameters()
         final_out_std = self.model_args.dim**-0.5

torchtitan/experiments/llama4/model/moe.py

Lines changed: 59 additions & 9 deletions
@@ -120,7 +120,7 @@ def __init__(
         self.use_sigmoid = use_sigmoid
 
     def forward(
-        self, x: torch.Tensor
+        self, x: torch.Tensor, expert_bias: torch.Tensor = None
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Args:
@@ -139,13 +139,17 @@ def forward(
 
         # By default, sigmoid or softmax is performed in float32 to avoid loss explosion
         if self.use_sigmoid:
-            scores = torch.sigmoid(scores.to(torch.float32)).to(x.dtype)
+            scores = torch.sigmoid(scores.to(torch.float32))
         else:
-            scores = F.softmax(scores.to(torch.float32), dim=1).to(x.dtype)
+            scores = F.softmax(scores.to(torch.float32), dim=1)
 
         # top scores shape (bs*slen, top_k)
-        top_scores, selected_experts_indices = torch.topk(scores, k=self.top_k, dim=1)
-        # top_scores /= top_scores.sum(dim=-1, keep_dim=True).to(x.dtype)
+        # NOTE: The expert_bias is only used for routing. The gating value
+        # top_scores is still derived from the original scores.
+        _, selected_experts_indices = torch.topk(
+            scores + expert_bias, k=self.top_k, dim=1
+        )
+        top_scores = scores.gather(dim=1, index=selected_experts_indices)
 
         # group tokens together by expert indices from 0 to num_experts and pass that to experts forward
         num_local_tokens_per_expert = torch.histc(
@@ -167,7 +171,6 @@ def init_weights(self, init_std: float):
         nn.init.trunc_normal_(self.gate.weight, mean=0.0, std=init_std)
 
 
-# TODO: implement load balancing auxiliary loss for token-choice routing
 class MoE(nn.Module):
     def __init__(self, model_args: TransformerModelArgs):
         super().__init__()
@@ -209,6 +212,35 @@ def __init__(self, model_args: TransformerModelArgs):
             else None
         )
 
+        # auxiliary-loss-free load balancing
+        self.load_balance_coeff = model_args.load_balance_coeff
+        # the fields below are defined even when load_balance_coeff is None
+        # to make initialization and checkpointing code simpler
+        self.register_buffer(
+            "expert_bias",
+            torch.zeros(num_experts, dtype=torch.float32),
+            persistent=True,
+        )
+        self.register_buffer(
+            "tokens_per_expert",
+            torch.zeros(num_experts, dtype=torch.float32),
+            persistent=True,
+        )
+
+        # NOTE: forward hook, forward pre hook, or backward pre hook
+        # would conflict with activation checkpointing
+        if self.load_balance_coeff is not None and self.load_balance_coeff > 0:
+            self.register_full_backward_hook(self._update_expert_bias)
+
+    def _update_expert_bias(self, *_):
+        expert_bias_delta = self.load_balance_coeff * torch.sign(
+            self.tokens_per_expert.mean() - self.tokens_per_expert
+        )
+        expert_bias_delta = expert_bias_delta - expert_bias_delta.mean()
+        self.expert_bias = self.expert_bias + expert_bias_delta
+
+        self.tokens_per_expert.zero_()
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Args:
@@ -218,13 +250,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
         """
         bs, slen, dim = x.shape
+
         # top_scores and selected_indices shape (bs*slen*top_k,)
        # num_local_tokens_per_expert shape (num_experts,)
         (
             top_scores,
             token_indices,
             num_local_tokens_per_expert,
-        ) = self.router(x.reshape(bs * slen, dim))
+        ) = self.router(x.reshape(bs * slen, dim), self.expert_bias)
+
+        # will be used to update the expert bias for load balancing
+        self.tokens_per_expert += num_local_tokens_per_expert
 
         # shape (bs*slen*top_k, dim)
         token_indices = token_indices.reshape(-1, 1).expand(-1, dim)
@@ -235,7 +271,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             dim=0,
             index=token_indices,
         )
-        routed_input = routed_input * top_scores.reshape(-1, 1)
+        routed_input = (routed_input.to(torch.float32) * top_scores.reshape(-1, 1)).to(
+            x.dtype
+        )
 
         if self.use_grouped_mm:
             # NOTE: In order to use torch._grouped_mm, we need to make sure
@@ -285,8 +323,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         out = out.reshape(bs, slen, dim)
         return out
 
-    def init_weights(self, init_std: float):
+    def init_weights(
+        self,
+        init_std: float,
+        buffer_device: torch.device,
+    ):
         self.experts.init_weights(init_std)
         self.router.init_weights(init_std)
         if self.shared_expert is not None:
             self.shared_expert.init_weights(init_std)
+
+        with torch.device(buffer_device):
+            self.expert_bias = torch.zeros(
+                self.experts.num_experts, dtype=torch.float32
+            )
+            self.tokens_per_expert = torch.zeros(
+                self.experts.num_experts, dtype=torch.float32
+            )

torchtitan/models/llama3/model.py

Lines changed: 6 additions & 7 deletions
@@ -8,7 +8,6 @@
 
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 import torch.nn.functional as F
@@ -25,10 +24,10 @@ class TransformerModelArgs(BaseModelArgs):
     dim: int = 4096
     n_layers: int = 32
     n_heads: int = 32
-    n_kv_heads: Optional[int] = None
+    n_kv_heads: int | None = None
     vocab_size: int = -1  # defined later by tokenizer
     multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
-    ffn_dim_multiplier: Optional[float] = None
+    ffn_dim_multiplier: float | None = None
     norm_eps: float = 1e-5
     rope_theta: float = 10000
 
@@ -93,7 +92,7 @@ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Te
     Args:
         dim (int): Dimension of the frequency tensor.
         end (int): End index for precomputing frequencies.
-        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+        theta (float | None): Scaling factor for frequency computation. Defaults to 10000.0.
 
     Returns:
         torch.Tensor: Precomputed frequency tensor with complex exponentials.
@@ -271,7 +270,7 @@ class FeedForward(nn.Module):
         dim (int): Input dimension.
         hidden_dim (int): Hidden dimension of the feedforward layer.
         multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
-        ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None.
+        ffn_dim_multiplier (float | None): Custom multiplier for hidden dimension. Defaults to None.
 
     Attributes:
         w1 (Linear): Linear transformation for the first layer.
@@ -285,7 +284,7 @@ def __init__(
         dim: int,
         hidden_dim: int,
         multiple_of: int,
-        ffn_dim_multiplier: Optional[float],
+        ffn_dim_multiplier: float | None,
     ):
         super().__init__()
         hidden_dim = int(2 * hidden_dim / 3)
@@ -419,7 +418,7 @@ def __init__(self, model_args: TransformerModelArgs):
 
     def init_weights(
         self,
-        buffer_device: Optional[torch.device] = None,
+        buffer_device: torch.device | None = None,
     ):
         """
         [Note: On ``init_weights`` vs. ``reset_parameters``]

torchtitan/models/llama3/parallelize_llama.py

Lines changed: 1 addition & 1 deletion
@@ -175,7 +175,7 @@ def apply_tp(
     # NOTE: At the cost of model code change, we can accelerate Sequence Parallel
     # by folding (and unfolding) the batch dimension and the sequence dimension.
     # Examples can be found at https://github.com/pytorch/torchtitan/pull/437
-    for layer_id, transformer_block in model.layers.items():
+    for transformer_block in model.layers.values():
         layer_plan = {
             "attention_norm": SequenceParallel(),
             "attention": prepare_module_input(
