Skip to content

Commit 7ef4dfc

Browse files
author
Marcin Kardas
committed
Bring back custom policy to support bias-less OPT-like models
1 parent 86bdbc9 commit 7ef4dfc

File tree

2 files changed

+66
-0
lines changed

2 files changed

+66
-0
lines changed

galai/model.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,15 @@ def _parallelize(self) -> None:
129129

130130
self._master_port = 13000 + (id(self.model) % 32749)
131131

132+
custom_policies = None
133+
if self.model.config.model_type == "opt" and not self.model.config.enable_bias:
134+
from galai.parallel_policy import OPTDecoderLayerPolicyNoBias
135+
custom_policies = [OPTDecoderLayerPolicyNoBias]
136+
132137
parallelize(
133138
self.model, num_gpus=self.num_gpus, fp16=self.dtype == torch.float16,
134139
master_port=self._master_port,
140+
custom_policies=custom_policies,
135141
)
136142

137143
def _set_tokenizer(self, tokenizer_path: str):

galai/parallel_policy.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from parallelformers.policies.base import Layer, Policy
2+
from parallelformers.utils.dist_utils import AllReduceLinear
3+
4+
from transformers.models.opt.modeling_opt import OPTDecoderLayer
5+
6+
7+
__all__ = ["OPTDecoderLayerPolicyNoBias"]
8+
9+
10+
class OPTDecoderLayerPolicyNoBias(Policy):
    """Parallelformers tensor-parallel policy for OPT decoder layers whose
    linear projections have no bias term (``config.enable_bias`` is False).

    Unlike the stock OPT policy, every ``Layer`` here declares only a
    ``weight`` tensor, so checkpoints that lack ``bias`` parameters can be
    sharded without KeyErrors.
    """

    @staticmethod
    def replace_arguments(config, world_size):
        """Per-rank attribute overrides: attention width is split evenly
        across the ``world_size`` model-parallel ranks."""
        return {
            "self_attn.embed_dim": config.hidden_size // world_size,
            "self_attn.num_heads": config.num_attention_heads // world_size,
        }

    @staticmethod
    def attn_qkv():
        """Column-sliced attention input projections (q, k, v), weight only."""
        return [
            Layer(weight=f"self_attn.{name}_proj.weight")
            for name in ("q", "k", "v")
        ]

    @staticmethod
    def attn_out():
        """Row-sliced attention output projection; partial results are summed
        across ranks via AllReduceLinear."""
        return [
            Layer(weight="self_attn.out_proj.weight", replace=AllReduceLinear),
        ]

    @staticmethod
    def mlp_in():
        """Column-sliced feed-forward input projection, weight only."""
        return [Layer(weight="fc1.weight")]

    @staticmethod
    def mlp_out():
        """Row-sliced feed-forward output projection, all-reduced across
        ranks."""
        return [Layer(weight="fc2.weight", replace=AllReduceLinear)]

    @staticmethod
    def original_layer_class():
        """The transformers module class this policy applies to."""
        return OPTDecoderLayer

0 commit comments

Comments
 (0)