Support ZBVZeroBubbleSchedule (#817)

H-Huang · tianyu-l · web-flow · commit 3996b634a58c · 2025-02-10T15:57:48.000-05:00
This is dependent on the changes in this pytorch stack: pytorch/pytorch#146217 Add support for running `ZBVZeroBubbleSchedule` and v-shaped CSV schedules in torchtitan Fixes #774 --------- Co-authored-by: tianyu-l <150487191+tianyu-l@users.noreply.github.com>
diff --git a/tests/integration_tests.py b/tests/integration_tests.py
@@ -139,6 +139,18 @@ def build_test_list():
             "pp_looped_zero_bubble",
             ngpu=4,
         ),
+        OverrideDefinitions(
+            [
+                [
+                    "--experimental.pipeline_parallel_degree 2",
+                    "--experimental.pipeline_parallel_schedule ZBVZeroBubble",
+                    "--experimental.pipeline_parallel_microbatches 8",
+                ],
+            ],
+            "PP zero bubble test (v shaped)",
+            "pp_zbv",
+            ngpu=2,
+        ),
         OverrideDefinitions(
             [
                 [
diff --git a/torchtitan/parallelisms/pipeline_llama.py b/torchtitan/parallelisms/pipeline_llama.py
@@ -13,7 +13,10 @@
 import torch.nn as nn
 from torch.distributed import DeviceMesh
 from torch.distributed.pipelining import PipelineStage
-
+from torch.distributed.pipelining.schedules import (
+    get_schedule_class,
+    ScheduleZBVZeroBubble,
+)
 from torchtitan.config_manager import JobConfig
 from torchtitan.logging import logger
 from torchtitan.models.llama.model import ModelArgs
@@ -43,7 +46,16 @@ def pipeline_llama(
 
     pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn)
 
-    return pp_schedule, models
+    # This is used in the train loop to determine whether to pass in the input_ids and labels
+    has_first_stage = False
+    has_last_stage = False
+    for stage in stages:
+        if stage.is_first:
+            has_first_stage = True
+        if stage.is_last:
+            has_last_stage = True
+
+    return pp_schedule, models, has_first_stage, has_last_stage
 
 
 def pipeline_llama_manual_split(
@@ -103,7 +115,13 @@ def _build_stage(stage_idx, start_layer, stop_layer, is_first=False, is_last=Fal
 
     stages = []
     models = []
-    for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style="loop"):
+
+    schedule_class = get_schedule_class(
+        job_config.experimental.pipeline_parallel_schedule
+    )
+    style = "v" if schedule_class == ScheduleZBVZeroBubble else "loop"
+
+    for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style):
         start_layer = splits[stage_idx - 1] if stage_idx > 0 else None
         stop_layer = splits[stage_idx] if stage_idx < num_stages - 1 else None
         stage, model_chunk = _build_stage(
diff --git a/train.py b/train.py
@@ -151,7 +151,12 @@ def loss_fn(pred, labels):
     # apply parallelisms and initialization
     if parallel_dims.pp_enabled:
         # apply PT-D Pipeline Parallel
-        pp_schedule, model_parts = models_pipelining_fns[model_name](
+        (
+            pp_schedule,
+            model_parts,
+            has_first_stage,
+            has_last_stage,
+        ) = models_pipelining_fns[model_name](
             model, pp_mesh, parallel_dims, job_config, device, model_config, loss_fn
         )
         # when PP is enabled, `model` obj is no longer used after this point, model_parts is used instead
@@ -285,22 +290,18 @@ def loss_fn(pred, labels):
 
             if parallel_dims.pp_enabled:
                 # Pipeline Parallel forward / backward inside step() call
-                is_last_stage = pp_mesh.get_local_rank() == pp_mesh.size() - 1
-
                 with train_context(optional_context_parallel_ctx):
-                    if pp_mesh.get_local_rank() == 0:
-                        pp_schedule.step(input_ids)
-                    elif is_last_stage:
-                        losses = []
-                        pp_schedule.step(target=labels, losses=losses)
+                    targets, losses = (labels, []) if has_last_stage else (None, None)
+                    if has_first_stage:
+                        pp_schedule.step(input_ids, target=targets, losses=losses)
                     else:
-                        pp_schedule.step()
+                        pp_schedule.step(target=targets, losses=losses)
 
                 # accumulate losses across pipeline microbatches
                 # TODO: PP+FSDP unexpectedly puts the loss back to the CPU
                 loss = (
                     torch.mean(torch.stack(losses)).to(device)
-                    if is_last_stage
+                    if has_last_stage
                     else torch.tensor([-1.0], device=device)
                 )
             else: