Update FSDP wrap: use ModuleWrapPolicy; drop expert-size check

zigzagcai · zigzagcai · commit 8e04b09f7a81 · 2025-03-12T15:24:50.000+08:00
diff --git a/internlm/core/fsdp.py b/internlm/core/fsdp.py
@@ -1,5 +1,4 @@
 import collections
-import functools
 import itertools
 from typing import List, Optional, Set, Union
 
@@ -11,7 +10,7 @@
     BackwardPrefetch,
     ShardingStrategy,
 )
-from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 
 from internlm.accelerator.abstract_accelerator import get_accelerator
 from internlm.core.context import ParallelMode
@@ -170,7 +169,7 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]):
                 module=model,
                 process_group=gpc.get_group(ParallelMode.GLOBAL),
                 sharding_strategy=ShardingStrategy.FULL_SHARD,  # ZeRO2: SHARD_GRAD_OP, ZeRO3: FULL_SHARD
-                auto_wrap_policy=functools.partial(transformer_auto_wrap_policy, transformer_layer_cls=set(wrap_cls)),
+                auto_wrap_policy=ModuleWrapPolicy(wrap_cls),
                 sync_module_states=fsdp_init_method != "cuda",  # sync model paramters
                 forward_prefetch=True,
                 backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
diff --git a/internlm/initialize/initialize_launcher.py b/internlm/initialize/initialize_launcher.py
@@ -579,10 +579,14 @@ def args_sanity_check():
         assert (
             not optim_ckpt.overlap_sync_grad & optim_ckpt.overlap_sync_param
         ), "not support overlap and moe at the same time"
-        assert gpc.config.parallel.zero1.size in (
-            -1,
-            gpc.get_world_size(ParallelMode.DATA),
-        ) or is_using_fsdp(), "moe only support zero1, set zero1=dict(size=-1,...) can fix this"
+        assert (
+            gpc.config.parallel.zero1.size
+            in (
+                -1,
+                gpc.get_world_size(ParallelMode.DATA),
+            )
+            or is_using_fsdp()
+        ), "moe only support zero1, set zero1=dict(size=-1,...) can fix this"
 
         if gpc.config.parallel.tensor.mode != "isp":
             assert gpc.config.parallel.expert_weight.size <= 1, "expert weight parallel is only supported with isp"
@@ -637,11 +641,6 @@ def args_sanity_check():
         assert (
             gpc.config.parallel.weight.size == 1
         ), f"fsdp only compatible with weight size = 1, but get weight size = {gpc.config.parallel.weight.size}"
-        if "expert" in gpc.config.parallel:
-            assert gpc.config.parallel.expert.size in (
-                1,
-                -1,
-            ), f"fsdp only compatible with expert size = (-1, 1), but get expert size = {gpc.config.parallel.expert.size}"
         if "expert_zero1" in gpc.config.parallel:
             assert gpc.config.parallel.expert_zero1.size == 1, (
                 f"fsdp only compatible with expert_zero1 size = 1, "