Commit aa623bf

add flux expert layer to support ep overlap
1 parent 58f723a commit aa623bf

File tree

6 files changed: +402 -6 lines

configs/7B_MoE4_sft.py

Lines changed: 6 additions & 5 deletions
@@ -1,7 +1,7 @@
 JOB_NAME = "7b_moe_train"
 DO_ALERT = False
 
-SEQ_LEN = 2048
+SEQ_LEN = 1024
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
 MLP_RATIO = 4 / 3
@@ -170,8 +170,9 @@
     # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
     qk_interleaved=False,
     num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-    moe_type="GShard",  # Support: "GShard", "MegaBlock", "MegaBlock-Dropless", "Dropless"
-    num_experts=4,
+    moe_type="Flux",  # Support: "GShard", "MegaBlock", "MegaBlock-Dropless", "Dropless", "Flux"
+    mlp_layer_fusion=True,
+    num_experts=8,
     top_k=2,
 )
 """
@@ -217,10 +218,10 @@
 """
 parallel = dict(
     zero1=dict(size=-1),
-    tensor=dict(size=1, mode="mtp"),
+    tensor=dict(size=8, mode="msp"),
     pipeline=dict(size=1, interleaved_overlap=True),
     weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
-    expert=dict(size=-1, no_tp=False),
+    expert=dict(size=8, no_tp=True),
     expert_weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
 )
 
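Read together, the config now trains with SEQ_LEN=1024, 8 experts routed top-2 through the new Flux MoE path, tensor parallel size 8 in msp mode, and an expert parallel group of size 8 with no_tp=True. Below is a minimal, hypothetical sanity check (not part of this commit; the divisibility rule is an assumed, typical expert-parallel constraint) that mirrors the updated dicts:

# hypothetical sanity check -- not part of this commit
model = dict(
    moe_type="Flux",       # new MoE backend selected by this commit
    mlp_layer_fusion=True,
    num_experts=8,
    top_k=2,
)
parallel = dict(
    tensor=dict(size=8, mode="msp"),
    expert=dict(size=8, no_tp=True),
)

def check_moe_config(model_cfg, parallel_cfg):
    ep_size = parallel_cfg["expert"]["size"]
    # assumed constraint: experts shard evenly across the expert-parallel group
    assert model_cfg["num_experts"] % ep_size == 0, (
        f"num_experts={model_cfg['num_experts']} is not divisible by expert parallel size {ep_size}"
    )
    assert model_cfg["top_k"] <= model_cfg["num_experts"]

check_moe_config(model, parallel)  # passes: 8 experts over an expert-parallel group of 8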

internlm/model/modules/mlp.py

Lines changed: 75 additions & 0 deletions
@@ -8,6 +8,7 @@
 
 from internlm.model.modules.linear import new_linear
 from internlm.model.modules.utils import Gelu, Silu
+from internlm.core.context import global_context as gpc
 from internlm.utils.logger import get_logger
 from internlm.utils.utils import ActivationType
 
@@ -259,6 +260,67 @@ def forward(self, x, batch_sizes=None):
         return out
 
 
+class FluxFeedForward(nn.Module):
+    """
+    Flux FeedForward.
+    Args:
+        in_features (int): size of each input sample
+        hidden_features (int): size of the hidden state of the FFN
+        out_features (int): size of each output sample
+        bias (bool): Whether bias is needed for the linears. True by default, but it is typically set to False
+                     in the config.
+        device (Optional[Union[str, torch.device]]): The device to be used.
+        dtype (Optional[torch.dtype]): The type of data.
+        multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default.
+        mlp_layer_fusion (Optional[bool]): Some linears without bias in FFN can be fused to reduce the comm cost of SP.
+        activation_type (str): the activation function used for the feed forward, "swiglu" by default.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int,
+        out_features: int = None,
+        bias: bool = True,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+        activation_type: str = "swiglu",
+        num_groups: int = 1,
+        backend: str = "bmm",
+        is_expert: bool = False,
+    ):
+        super().__init__()
+
+        # TODO: support gelu...
+        assert activation_type in ("swiglu",), f"Unsupported activation type: {activation_type}"
+        assert bias is False, "Grouped FeedForward only supports bias=False."
+
+        self.w1 = new_linear(
+            "grouped_w1",
+            in_features,
+            hidden_features,
+            bias,
+            device=device,
+            dtype=dtype,
+            num_groups=num_groups,
+            backend=backend,
+            is_expert=is_expert,
+        )
+        self.w2 = new_linear(
+            "grouped_w2",
+            hidden_features,
+            out_features,
+            bias,
+            device=device,
+            dtype=dtype,
+            num_groups=num_groups,
+            backend=backend,
+            is_expert=is_expert,
+        )
+        self._register_load_state_dict_pre_hook(_grouped_mlp_pre_load_convert, with_module=True)
+        self._register_state_dict_hook(_grouped_mlp_save_convert)
+
+
 def new_feed_forward(
     in_features: int,
     hidden_features: int,
@@ -276,6 +338,19 @@ def new_feed_forward(
     if use_grouped_mlp:
         num_groups = kwargs.pop("num_groups", 1)
         backend = kwargs.pop("backend", "bmm")
+        if gpc.config.model.moe_type == "Flux":
+            return FluxFeedForward(
+                in_features,
+                hidden_features,
+                out_features,
+                bias,
+                device,
+                dtype,
+                activation_type,
+                num_groups=num_groups,
+                backend=backend,
+                is_expert=is_expert,
+            )
     return GroupedFeedForward(
         in_features,
         hidden_features,
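FluxFeedForward only declares the grouped w1/w2 projections here; its forward pass is not shown in this hunk. As a rough, standalone sketch of what a grouped SwiGLU expert MLP with a bmm backend computes (the fused gate/up layout of w1 is an assumption, not taken from this commit):

# Illustrative only: a standalone grouped SwiGLU expert MLP using torch.bmm.
import torch
import torch.nn.functional as F

def grouped_swiglu_mlp(x, w1, w2):
    """
    x:  (num_groups, tokens_per_group, in_features)
    w1: (num_groups, in_features, 2 * hidden_features)  # assumed fused gate + up projection
    w2: (num_groups, hidden_features, out_features)
    """
    fused = torch.bmm(x, w1)           # (g, t, 2 * hidden)
    gate, up = fused.chunk(2, dim=-1)  # split the fused projection
    hidden = F.silu(gate) * up         # SwiGLU activation
    return torch.bmm(hidden, w2)       # (g, t, out_features)

# Example shapes: 8 experts, 16 tokens per expert, 4096 -> 2048 -> 4096
out = grouped_swiglu_mlp(
    torch.randn(8, 16, 4096),
    torch.randn(8, 4096, 2 * 2048),
    torch.randn(8, 2048, 4096),
)
print(out.shape)  # torch.Size([8, 16, 4096])

With moe_type="Flux" set in the config above, new_feed_forward now routes grouped expert MLPs to FluxFeedForward instead of GroupedFeedForward.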

internlm/model/moe/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,7 @@
 from .dropless_layer import DroplessMoELayer
 from .experts import Experts
 from .gshard_layer import GShardMoELayer
+from .flux_layer import FluxMoELayer
 from .megablocks import (
     MegaBlockdMoE,
     MegaBlockFeedForward,
@@ -18,4 +19,5 @@
     "MegaBlockFeedForward",
     "MegaBlockGroupedFeedForward",
     "DroplessMoELayer",
+    "FluxMoELayer",
 ]
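For context, here is a hypothetical dispatch sketch (not part of this commit) showing how the newly exported FluxMoELayer could be selected by moe_type, mirroring the gpc.config.model.moe_type check added in mlp.py:

# hypothetical dispatch helper -- the actual MoE layer selection logic is not shown in this commit
from internlm.model.moe import DroplessMoELayer, FluxMoELayer, GShardMoELayer

_MOE_LAYERS = {
    "GShard": GShardMoELayer,
    "Dropless": DroplessMoELayer,
    "Flux": FluxMoELayer,  # newly exported above
}

def get_moe_layer_cls(moe_type: str):
    if moe_type not in _MOE_LAYERS:
        raise ValueError(f"Unsupported moe_type: {moe_type}")
    return _MOE_LAYERS[moe_type]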
