@@ -579,10 +579,14 @@ def args_sanity_check():
579
579
assert (
580
580
not optim_ckpt .overlap_sync_grad & optim_ckpt .overlap_sync_param
581
581
), "not support overlap and moe at the same time"
582
- assert gpc .config .parallel .zero1 .size in (
583
- - 1 ,
584
- gpc .get_world_size (ParallelMode .DATA ),
585
- ) or is_using_fsdp (), "moe only support zero1, set zero1=dict(size=-1,...) can fix this"
582
+ assert (
583
+ gpc .config .parallel .zero1 .size
584
+ in (
585
+ - 1 ,
586
+ gpc .get_world_size (ParallelMode .DATA ),
587
+ )
588
+ or is_using_fsdp ()
589
+ ), "moe only support zero1, set zero1=dict(size=-1,...) can fix this"
586
590
587
591
if gpc .config .parallel .tensor .mode != "isp" :
588
592
assert gpc .config .parallel .expert_weight .size <= 1 , "expert weight parallel is only supported with isp"
@@ -637,11 +641,6 @@ def args_sanity_check():
637
641
assert (
638
642
gpc .config .parallel .weight .size == 1
639
643
), f"fsdp only compatible with weight size = 1, but get weight size = { gpc .config .parallel .weight .size } "
640
- if "expert" in gpc .config .parallel :
641
- assert gpc .config .parallel .expert .size in (
642
- 1 ,
643
- - 1 ,
644
- ), f"fsdp only compatible with expert size = (-1, 1), but get expert size = { gpc .config .parallel .expert .size } "
645
644
if "expert_zero1" in gpc .config .parallel :
646
645
assert gpc .config .parallel .expert_zero1 .size == 1 , (
647
646
f"fsdp only compatible with expert_zero1 size = 1, "
0 commit comments