
Commit ba09114

pd: fix local_rank and gradient sync in multi-node training (#4811)
1. Get the local rank from the `PADDLE_LOCAL_RANK` environment variable instead of `get_rank()` (which returns the global rank).
2. Disable gradient synchronization during forward-backward and synchronize manually before the optimizer update.
3. Update the parallel-training tutorial (multi-node, multi-GPU) in the documentation.

## Summary by CodeRabbit

- **Bug Fixes**
  - Improved gradient synchronization in distributed training for multi-process setups.
  - Updated local rank assignment to use environment variables for enhanced compatibility.
- **Documentation**
  - Added an example using `mpirun` and a sample shell script to the parallel training guide for distributed training launch.

---------

Signed-off-by: HydrogenSulfate <[email protected]>
Co-authored-by: Copilot <[email protected]>
1 parent 617d3e2 commit ba09114

File tree

3 files changed: +41 -10 lines changed


deepmd/pd/train/training.py

Lines changed: 25 additions & 9 deletions
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+import contextlib
 import functools
 import logging
 import time
@@ -18,6 +19,7 @@
 from paddle.distributed import (
     fleet,
 )
+from paddle.distributed.fleet.utils import hybrid_parallel_util as hpu
 from paddle.framework import (
     core,
 )
@@ -741,16 +743,30 @@ def step(_step_id, task_key="Default") -> None:
                     pref_lr = _lr.start_lr
                 else:
                     pref_lr = cur_lr
-                with nvprof_context(enable_profiling, "Forward pass"):
-                    model_pred, loss, more_loss = self.wrapper(
-                        **input_dict,
-                        cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION),
-                        label=label_dict,
-                        task_key=task_key,
-                    )
 
-                with nvprof_context(enable_profiling, "Backward pass"):
-                    loss.backward()
+                # disable synchronization in forward-backward manually
+                # as derivatives exist in model forward
+                no_sync_context = (
+                    self.wrapper.no_sync
+                    if self.world_size > 1
+                    else contextlib.nullcontext
+                )
+                with no_sync_context():
+                    with nvprof_context(enable_profiling, "Forward pass"):
+                        model_pred, loss, more_loss = self.wrapper(
+                            **input_dict,
+                            cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION),
+                            label=label_dict,
+                            task_key=task_key,
+                        )
+
+                    with nvprof_context(enable_profiling, "Backward pass"):
+                        loss.backward()
+
+                # fuse + allreduce manually before optimization if use DDP + no_sync
+                # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
+                if self.world_size > 1:
+                    hpu.fused_allreduce_gradients(list(self.wrapper.parameters()), None)
 
                 if self.gradient_max_norm > 0.0:
                     with nvprof_context(enable_profiling, "Gradient clip"):
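The diff above lives inside the trainer's `self.wrapper`, so here is a minimal, self-contained sketch of the same pattern: run forward and backward under `DataParallel.no_sync()` so Paddle does not all-reduce gradients on every backward call, then fuse and all-reduce them once right before the optimizer update. The toy `Linear` model, SGD optimizer, and tensor shapes are illustrative assumptions, not DeePMD-kit code.

```python
import contextlib

import paddle
import paddle.distributed as dist
from paddle.distributed.fleet.utils import hybrid_parallel_util as hpu

world_size = dist.get_world_size()
if world_size > 1:
    # only needed when launched via `python -m paddle.distributed.launch`
    dist.init_parallel_env()

# placeholder model/optimizer standing in for DeePMD-kit's model wrapper
model = paddle.nn.Linear(16, 1)
if world_size > 1:
    model = paddle.DataParallel(model)
opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters())

x = paddle.randn([8, 16])
y = paddle.randn([8, 1])

# skip per-backward gradient synchronization when running distributed,
# mirroring the no_sync_context logic added in training.py
no_sync_context = model.no_sync if world_size > 1 else contextlib.nullcontext
with no_sync_context():
    loss = ((model(x) - y) ** 2).mean()
    loss.backward()

# fuse + all-reduce gradients manually, once, right before the update
if world_size > 1:
    hpu.fused_allreduce_gradients(list(model.parameters()), None)

opt.step()
opt.clear_grad()
```

The commit's comment notes that derivatives are computed inside the model forward, which is why both passes run under `no_sync` and the all-reduce happens exactly once before the update (see the linked Paddle issue for details).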

deepmd/pd/utils/env.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 ncpus = os.cpu_count()
 NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(0, ncpus)))
 # Make sure DDP uses correct device if applicable
-LOCAL_RANK = paddle.distributed.get_rank()
+LOCAL_RANK = int(os.environ.get("PADDLE_LOCAL_RANK", 0))
 
 if os.environ.get("DEVICE") == "cpu" or paddle.device.cuda.device_count() <= 0:
     DEVICE = "cpu"
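For the `env.py` change, a quick illustration (not DeePMD-kit code) of why the global rank from `get_rank()` is the wrong value for picking a device on multi-node jobs, assuming `paddle.distributed.launch` exports `PADDLE_LOCAL_RANK` as the commit relies on:

```python
import os

# With 2 nodes x 8 GPUs, paddle.distributed.get_rank() returns the GLOBAL rank
# (0..15); on the second node that would ask for gpu:8..gpu:15, which do not
# exist on that node. PADDLE_LOCAL_RANK is the per-node index (0..7), so it is
# the right value for device selection.
local_rank = int(os.environ.get("PADDLE_LOCAL_RANK", 0))
device = f"gpu:{local_rank}"  # e.g. global rank 9 on node 2 still maps to gpu:1
print(device)
```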

doc/train/parallel-training.md

Lines changed: 15 additions & 0 deletions
@@ -218,6 +218,21 @@ NUM_WORKERS=0 HDF5_USE_FILE_LOCKING=0 python -m paddle.distributed.launch \
     dp --pd train input.json
 ```
 
+or you can wrap the training command in a shell script and launch it with `mpirun`:
+
+```bash
+# ----- train_pp.sh -------
+unset CUDA_DEVICE_MAX_CONNECTIONS
+python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir logs dp --pd train input_torch.json -l train_pp.log
+# -------------------------
+```
+
+Then, run the script on the first node with:
+
+```bash
+mpirun bash train_pp.sh
+```
+
 :::{note}
 
 If `NUM_WORKERS` is too large, it may cause the program to be terminated by the system;
