
Commit e908eec

njhill authored and Xaenalt committed
Fix llama gqa attention bias (IBM#88)
To support IBM Granite Code 8B models.

Signed-off-by: Nick Hill <[email protected]>
1 parent 5953686 commit e908eec

File tree

2 files changed (+16, −8 lines)


server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

Lines changed: 8 additions & 4 deletions
@@ -155,10 +155,9 @@ def _load_gqa(config, prefix: str, weights):
     assert config.hidden_size % config.num_attention_heads == 0
     assert config.num_attention_heads % weights.process_group.size() == 0
 
+    prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
     weight = weights.get_multi_weights_col(
-        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-        quantize=config.quantize,
-        dim=0
+        prefixes=prefixes, quantize=config.quantize, dim=0
     )
 
     if config.quantize != "gptq":
@@ -172,7 +171,12 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
 
-    return TensorParallelColumnLinear(get_linear(weight, bias=config.attention_bias, quantize=config.quantize))
+    if config.attention_bias:
+        bias = torch.cat([weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes], dim=0)
+    else:
+        bias = None
+
+    return TensorParallelColumnLinear(get_linear(weight, bias=bias, quantize=config.quantize))
 
 
 class FlashLlamaAttention(torch.nn.Module):
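
The substance of the change: the old code passed config.attention_bias, a boolean config flag, straight through to get_linear as the bias, while the fix loads each projection's bias shard with weights.get_sharded and concatenates them along dim 0 so that get_linear receives a real tensor (or None) whose rows line up with the fused q/k/v weight. A minimal sketch of that shape bookkeeping with plain torch tensors; the head counts and sizes below are illustrative, not taken from this commit:

    import torch

    head_size = 128
    num_heads = 32        # query heads on this shard
    num_kv_heads = 8      # key/value heads on this shard (GQA: fewer than query heads)

    # Stand-ins for what weights.get_sharded(f"{p}.bias", dim=0) would return per projection.
    q_bias = torch.zeros(num_heads * head_size)
    k_bias = torch.zeros(num_kv_heads * head_size)
    v_bias = torch.zeros(num_kv_heads * head_size)

    # Concatenate along dim 0 so the bias matches the fused [q; k; v] column-parallel weight.
    bias = torch.cat([q_bias, k_bias, v_bias], dim=0)
    assert bias.shape[0] == (num_heads + 2 * num_kv_heads) * head_size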

server/text_generation_server/models/custom_modeling/paged_llama_modeling.py

Lines changed: 8 additions & 4 deletions
@@ -156,10 +156,9 @@ def _load_gqa(config, prefix: str, weights):
     assert config.hidden_size % config.num_attention_heads == 0
     assert config.num_attention_heads % weights.process_group.size() == 0
 
+    prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
     weight = weights.get_multi_weights_col(
-        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-        quantize=config.quantize,
-        dim=0
+        prefixes=prefixes, quantize=config.quantize, dim=0
     )
 
     if config.quantize != "gptq":
@@ -173,7 +172,12 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
 
-    return TensorParallelColumnLinear(get_linear(weight, bias=config.attention_bias, quantize=config.quantize))
+    if config.attention_bias:
+        bias = torch.cat([weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes], dim=0)
+    else:
+        bias = None
+
+    return TensorParallelColumnLinear(get_linear(weight, bias=bias, quantize=config.quantize))
 
 
 class PagedLlamaAttention(torch.nn.Module):
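
The same fix is applied to the paged-attention variant of the model. For context on why it matters (per the commit message, IBM Granite Code 8B support): these models follow the Llama architecture but set attention_bias=True in their config, which is what routes _load_gqa into the new bias branch. A hedged illustration using Hugging Face's LlamaConfig; the hyperparameter values are made up for the example, not copied from any released checkpoint:

    from transformers import LlamaConfig

    # Illustrative config only: a Llama-architecture model with GQA and biased attention
    # projections, the combination the patched _load_gqa path is meant to handle.
    config = LlamaConfig(
        hidden_size=4096,
        num_attention_heads=32,
        num_key_value_heads=8,   # grouped-query attention: fewer k/v heads than query heads
        attention_bias=True,     # q/k/v projections carry bias terms
    )
    print(config.attention_bias)  # True -> the bias-concatenation branch above is taken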
