@@ -147,6 +147,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         }
 
         # LayerNorm-specific variables
+        epsilon = config.rms_norm_eps if hasattr(config, "rms_norm_eps") else 1e-06
         self.layernorm_attrs = {
             "simple": True,                  # Use SimplifiedLayerNorm/SkipSimplifiedLayerNorm vs. LayerNorm/SkipLayerNorm
             "first_layernorm": True,         # 1st LayerNorm = LayerNorm, then SkipLayerNorm for all subsequent LayerNorms
@@ -156,6 +157,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
             "output_0": "",                  # Output 0 for LayerNorm and SkipLayerNorm
             "output_3": "",                  # Output 3 for SkipLayerNorm
             "add_offset": 0,                 # Offset value for LayerNorm weight
+            "epsilon": epsilon,              # Epsilon value to avoid `sqrt(0)` in LayerNorm
         }
 
         # MatMul-specific variables
@@ -212,6 +214,8 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         }
 
         # Attention-specific variables (MHA, GQA, GQA + Rot.Emb., etc.)
+        softcap = config.attn_logit_softcapping if hasattr(config, "attn_logit_softcapping") else 0.0  # default is 0.0 in GroupQueryAttention kernel
+
         # Block-sparse attention-specific variables
         sparse_block_size = config.blocksparse_block_size if hasattr(config, "blocksparse_block_size") else 0
         kernel_block_size = config.blocksparse_triton_kernel_block_size if hasattr(config, "blocksparse_triton_kernel_block_size") else 0
@@ -224,6 +228,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
             "v_path": "",                    # V path to attention
             "op_type": "MultiHeadAttention", # Attention op to use
             "scale": 1 / np.sqrt(self.head_size),  # Scale value after calculating Q x K' in attention
+            "softcap": softcap,              # Softcap value to prevent values from exploding in attention
             "use_rotemb_in_attn": False,     # Use rotary embeddings within attention (instead of a separate RotaryEmbedding op)
             "use_packed_matmul": False,      # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V)
             "block_sparse": {                # Block-sparse attention-specific variables
@@ -969,7 +974,7 @@ def make_layernorm(self, layer_id, layernorm, skip, simple, location):
 
         name = f"/model/layers.{layer_id}/{location}_layernorm/{'Skip' if skip else ''}LayerNorm"
         op_type = f"{'Skip' if skip else ''}{'Simplified' if simple else ''}LayerNormalization"
-        kwargs = {"epsilon": 9.999999747378752e-06}
+        kwargs = {"epsilon": self.layernorm_attrs["epsilon"]}
         if not skip:
             kwargs.update({"axis": -1, "stash_type": 1})
 
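The change above replaces the hardcoded epsilon with the value read from config.rms_norm_eps. For context, SimplifiedLayerNormalization is an RMS-style normalization, and the epsilon keeps the square root away from zero when activations vanish. Below is a minimal NumPy sketch of that math, assuming the standard RMSNorm formulation y = x / sqrt(mean(x^2) + epsilon) * weight; the function name and shapes are illustrative only, not part of the diff.

import numpy as np

def simplified_layernorm(x, weight, epsilon=1e-6):
    # epsilon keeps the denominator away from sqrt(0) when x is all zeros
    variance = np.mean(np.square(x), axis=-1, keepdims=True)
    return x / np.sqrt(variance + epsilon) * weight

x = np.zeros((1, 4), dtype=np.float32)   # all-zero activations
w = np.ones(4, dtype=np.float32)
print(simplified_layernorm(x, w))        # finite zeros, no division by zero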
@@ -1381,7 +1386,7 @@ def make_group_query_attention(self, name, **kwargs):
         self.make_node(
             "GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft",
             num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads, scale=self.attention_attrs["scale"],  # local_window_size=self.window_size, # Disable sliding window attribute temporarily
-            do_rotary=self.attention_attrs["use_rotemb_in_attn"], rotary_interleaved=self.rotemb_attrs["interleaved"],
+            softcap=self.attention_attrs["softcap"], do_rotary=self.attention_attrs["use_rotemb_in_attn"], rotary_interleaved=self.rotemb_attrs["interleaved"],
         )
         self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * self.num_attn_heads])
 
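The softcap attribute forwarded here squashes the attention scores before softmax so they cannot grow without bound, and a value of 0.0 leaves the scores untouched. Below is a minimal NumPy sketch of that logit-softcapping step, assuming the Gemma-2-style formula scores = softcap * tanh(scores / softcap); names and values are illustrative, not part of the diff.

import numpy as np

def softcap_logits(scores, softcap):
    if softcap == 0.0:
        return scores                            # default: no capping
    return softcap * np.tanh(scores / softcap)   # bounds values to (-softcap, softcap)

scores = np.array([[-120.0, 3.0, 250.0]], dtype=np.float32)  # raw Q x K' / sqrt(d)
print(softcap_logits(scores, 50.0))              # roughly [-49.2, 3.0, 50.0]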