@@ -64,19 +64,13 @@ def multi_head_attention_forward(
) -> Tuple[Tensor, Optional[Tensor]]:

    is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
-
-    # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
-    # is batched, run the computation and before returning squeeze the
-    # batch dimension so that the output doesn't carry this temporary batch dimension.
    if not is_batched:
-        # unsqueeze if the input is unbatched
        query = query.unsqueeze(1)
        key = key.unsqueeze(1)
        value = value.unsqueeze(1)
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask.unsqueeze(0)

-    # set up shape vars
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape

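The comments dropped above describe the unbatched-input convention: a 2-D `(L, E)` query is lifted to `(L, 1, E)`, processed as a batch of one, and squeezed back on the way out. A minimal sketch of that round trip, with hypothetical sizes chosen only for illustration:

```python
import torch

tgt_len, embed_dim = 5, 16                 # hypothetical sizes
query = torch.randn(tgt_len, embed_dim)    # unbatched input: (L, E)

q = query.unsqueeze(1)                     # pretend-batched: (L, 1, E)
# ... attention math runs on the batched shape ...
out = q                                    # stand-in for the attention output, still (L, 1, E)
out = out.squeeze(1)                       # squeezed back to the unbatched shape (L, E)
assert out.shape == (tgt_len, embed_dim)
```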
@@ -96,9 +90,6 @@ def multi_head_attention_forward(
    )

    if is_causal and key_padding_mask is None and not need_weights:
-        # when we have a kpm or need weights, we need attn_mask
-        # Otherwise, we use the is_causal hint go as is_causal
-        # indicator to SDPA.
        attn_mask = None
    else:
        attn_mask = _canonical_mask(
@@ -111,31 +102,23 @@ def multi_head_attention_forward(
        )

        if key_padding_mask is not None:
-            # We have the attn_mask, and use that to merge kpm into it.
-            # Turn off use of is_causal hint, as the merged mask is no
-            # longer causal.
            is_causal = False

    assert (
        embed_dim == embed_dim_to_check
    ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
    if isinstance(embed_dim, torch.Tensor):
-        # embed_dim can be a tensor when JIT tracing
        head_dim = embed_dim.div(num_heads, rounding_mode="trunc")
    else:
        head_dim = embed_dim // num_heads
    assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
    if use_separate_proj_weight:
-        # allow MHA to have different embedding dimensions when separate projection weights are used
        assert (
            key.shape[:2] == value.shape[:2]
        ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
    else:
        assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"

-    #
-    # compute in-projection
-    #
    if not use_separate_proj_weight:
        assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
        q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
@@ -159,10 +142,7 @@ def multi_head_attention_forward(
            b_v,
        )

-    # prep attention mask
-
    if attn_mask is not None:
-        # ensure attn_mask's dim is 3
        if attn_mask.dim() == 2:
            correct_2d_size = (tgt_len, src_len)
            if attn_mask.shape != correct_2d_size:
@@ -179,7 +159,6 @@ def multi_head_attention_forward(
        else:
            raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")

-    # add bias along batch dimension (currently second)
    if bias_k is not None and bias_v is not None:
        assert static_k is None, "bias cannot be added to static key."
        assert static_v is None, "bias cannot be added to static value."
@@ -193,14 +172,10 @@ def multi_head_attention_forward(
        assert bias_k is None
        assert bias_v is None

-    #
-    # reshape q, k, v for multihead attention and make them batch first
-    #
    q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if static_k is None:
        k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
    else:
-        # TODO finish disentangling control flow so we don't do in-projections when statics are passed
        assert (
            static_k.size(0) == bsz * num_heads
        ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
@@ -209,14 +184,12 @@ def multi_head_attention_forward(
    if static_v is None:
        v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
    else:
-        # TODO finish disentangling control flow so we don't do in-projections when statics are passed
        assert (
            static_v.size(0) == bsz * num_heads
        ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
        assert static_v.size(2) == head_dim, f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
        v = static_v

-    # add zero attention along batch dimension (now first)
    if add_zero_attn:
        zero_attn_shape = (bsz * num_heads, 1, head_dim)
        k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
@@ -226,10 +199,8 @@ def multi_head_attention_forward(
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

-    # update source sequence length after adjustments
    src_len = k.size(1)

-    # merge key padding and attention masks
    if key_padding_mask is not None:
        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
            _check_key_padding_mask(key_padding_mask, src_len, bsz)
@@ -242,16 +213,11 @@ def multi_head_attention_forward(
        else:
            attn_mask = attn_mask + key_padding_mask

-    # adjust dropout probability
    if not training:
        dropout_p = 0.0

-    #
-    # (deep breath) calculate attention and out projection
-    #
-
    if need_weights:
-        _B, _Nt, E = q.shape
+        _B, _Nt, E = q.shape  # noqa: F841
        q_scaled = q * math.sqrt(1.0 / float(E))

        assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"
@@ -270,20 +236,15 @@ def multi_head_attention_forward(
        attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))

-        # optionally average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        if average_attn_weights:
            attn_output_weights = attn_output_weights.mean(dim=1)

        if not is_batched:
-            # squeeze the output if input was unbatched
            attn_output = attn_output.squeeze(1)
            attn_output_weights = attn_output_weights.squeeze(0)
        return attn_output, attn_output_weights
    else:
-        # attn_mask can be either (L,S) or (N*num_heads, L, S)
-        # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S)
-        # in order to match the input for SDPA of (N, num_heads, L, S)
        if attn_mask is not None:
            if attn_mask.size(0) == 1 and attn_mask.dim() == 3:
                attn_mask = attn_mask.unsqueeze(0)
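The deleted comments in this hunk note that `scaled_dot_product_attention` expects a mask that broadcasts against `(N, num_heads, L, S)`, which is why a `(1, L, S)` mask picks up an extra leading dimension. A rough sketch of that broadcast, calling `torch.nn.functional.scaled_dot_product_attention` directly with hypothetical sizes:

```python
import torch
import torch.nn.functional as F

bsz, num_heads, tgt_len, src_len, head_dim = 2, 4, 5, 5, 8   # hypothetical sizes
q = torch.randn(bsz, num_heads, tgt_len, head_dim)
k = torch.randn(bsz, num_heads, src_len, head_dim)
v = torch.randn(bsz, num_heads, src_len, head_dim)

attn_mask = torch.zeros(1, tgt_len, src_len)   # (1, L, S), additive float mask
attn_mask = attn_mask.unsqueeze(0)             # (1, 1, L, S) broadcasts over (N, num_heads, L, S)

out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
assert out.shape == (bsz, num_heads, tgt_len, head_dim)
```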
@@ -300,7 +261,6 @@ def multi_head_attention_forward(
        attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
        if not is_batched:
-            # squeeze the output if input was unbatched
            attn_output = attn_output.squeeze(1)
        return attn_output, None
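For reference, a minimal sketch of exercising both return paths through the public `nn.MultiheadAttention` wrapper, which routes into `multi_head_attention_forward`; the sizes below are hypothetical:

```python
import torch
import torch.nn as nn

embed_dim, num_heads, tgt_len, bsz = 32, 4, 5, 2     # hypothetical sizes
mha = nn.MultiheadAttention(embed_dim, num_heads)    # expects (L, N, E) by default

query = torch.randn(tgt_len, bsz, embed_dim)
key = value = torch.randn(tgt_len, bsz, embed_dim)

# need_weights=True takes the explicit softmax path and returns head-averaged weights
out, weights = mha(query, key, value, need_weights=True)
assert out.shape == (tgt_len, bsz, embed_dim)
assert weights.shape == (bsz, tgt_len, tgt_len)

# need_weights=False takes the scaled_dot_product_attention path and returns no weights
out, weights = mha(query, key, value, need_weights=False)
assert weights is None
```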