@@ -63,9 +63,7 @@ def multi_head_attention_forward(
     is_causal: bool = False,
 ) -> Tuple[Tensor, Optional[Tensor]]:
 
-    is_batched = _mha_shape_check(
-        query, key, value, key_padding_mask, attn_mask, num_heads
-    )
+    is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
 
     # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
     # is batched, run the computation and before returning squeeze the
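A minimal sketch (not part of the diff) of the unbatched path that the comment above describes, with invented shapes: the (L, E) input gets a fake batch dimension, the batched computation runs, and the batch dimension is squeezed off before returning.

import torch

query = torch.randn(5, 16)            # (tgt_len, embed_dim), unbatched input
query = query.unsqueeze(1)            # (tgt_len, 1, embed_dim): pretend batch size 1
# ... run the batched attention computation here ...
attn_output = query                   # stand-in for the batched result
attn_output = attn_output.squeeze(1)  # back to (tgt_len, embed_dim) before returning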
@@ -126,37 +124,25 @@ def multi_head_attention_forward(
         head_dim = embed_dim.div(num_heads, rounding_mode="trunc")
     else:
         head_dim = embed_dim // num_heads
-    assert (
-        head_dim * num_heads == embed_dim
-    ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
+    assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
     if use_separate_proj_weight:
         # allow MHA to have different embedding dimensions when separate projection weights are used
         assert (
             key.shape[:2] == value.shape[:2]
         ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
     else:
-        assert (
-            key.shape == value.shape
-        ), f"key shape {key.shape} does not match value shape {value.shape}"
+        assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
 
     #
     # compute in-projection
     #
     if not use_separate_proj_weight:
-        assert (
-            in_proj_weight is not None
-        ), "use_separate_proj_weight is False but in_proj_weight is None"
+        assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
         q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
     else:
-        assert (
-            q_proj_weight is not None
-        ), "use_separate_proj_weight is True but q_proj_weight is None"
-        assert (
-            k_proj_weight is not None
-        ), "use_separate_proj_weight is True but k_proj_weight is None"
-        assert (
-            v_proj_weight is not None
-        ), "use_separate_proj_weight is True but v_proj_weight is None"
+        assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
+        assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
+        assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
     if in_proj_bias is None:
         b_q = b_k = b_v = None
     else:
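For context, a small sketch (not part of the diff) of what the packed in-projection amounts to in the self-attention case: a single (3*E, E) weight is applied once and the result is split into q, k, v. This is a simplified stand-in for _in_projection_packed, not its exact implementation; sizes are invented.

import torch
import torch.nn.functional as F

embed_dim, num_heads = 16, 4
head_dim = embed_dim // num_heads
assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"

x = torch.randn(5, 2, embed_dim)                        # (seq_len, bsz, embed_dim)
in_proj_weight = torch.randn(3 * embed_dim, embed_dim)  # packed q/k/v projection
in_proj_bias = torch.zeros(3 * embed_dim)
q, k, v = F.linear(x, in_proj_weight, in_proj_bias).chunk(3, dim=-1)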
@@ -191,9 +177,7 @@ def multi_head_attention_forward(
                 f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}."
             )
         else:
-            raise RuntimeError(
-                f"attn_mask's dimension {attn_mask.dim()} is not supported"
-            )
+            raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
 
     # add bias along batch dimension (currently second)
     if bias_k is not None and bias_v is not None:
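A short sketch (not part of the diff) of the mask shapes these checks accept, with made-up sizes: a 2D mask is broadcast over every batch-head slice, while a 3D mask must already match (bsz * num_heads, tgt_len, src_len); any other rank hits the RuntimeError above.

import torch

bsz, num_heads, tgt_len, src_len = 2, 4, 5, 7
mask_2d = torch.zeros(tgt_len, src_len, dtype=torch.bool)                   # broadcast to all heads
mask_3d = torch.zeros(bsz * num_heads, tgt_len, src_len, dtype=torch.bool)  # per-head mask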
@@ -220,9 +204,7 @@ def multi_head_attention_forward(
         assert (
             static_k.size(0) == bsz * num_heads
         ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
-        assert (
-            static_k.size(2) == head_dim
-        ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
+        assert static_k.size(2) == head_dim, f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
         k = static_k
     if static_v is None:
         v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
@@ -231,20 +213,14 @@ def multi_head_attention_forward(
         assert (
             static_v.size(0) == bsz * num_heads
         ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
-        assert (
-            static_v.size(2) == head_dim
-        ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
+        assert static_v.size(2) == head_dim, f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
         v = static_v
 
     # add zero attention along batch dimension (now first)
     if add_zero_attn:
         zero_attn_shape = (bsz * num_heads, 1, head_dim)
-        k = torch.cat(
-            [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1
-        )
-        v = torch.cat(
-            [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1
-        )
+        k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
+        v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
         if attn_mask is not None:
             attn_mask = pad(attn_mask, (0, 1))
         if key_padding_mask is not None:
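A brief sketch (not part of the diff) of the add_zero_attn branch with toy sizes: one all-zero key/value slot is appended per batch-head row, and the masks are padded by one column so the extra slot stays attendable.

import torch
from torch.nn.functional import pad

bsz_x_heads, tgt_len, src_len, head_dim = 8, 5, 7, 4
k = torch.randn(bsz_x_heads, src_len, head_dim)
v = torch.randn(bsz_x_heads, src_len, head_dim)
attn_mask = torch.zeros(bsz_x_heads, tgt_len, src_len)

zero_attn_shape = (bsz_x_heads, 1, head_dim)
k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
attn_mask = pad(attn_mask, (0, 1))   # last dim grows from src_len to src_len + 1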
@@ -259,9 +235,7 @@ def multi_head_attention_forward(
             _check_key_padding_mask(key_padding_mask, src_len, bsz)
 
         key_padding_mask = (
-            key_padding_mask.view(bsz, 1, 1, src_len)
-            .expand(-1, num_heads, -1, -1)
-            .reshape(bsz * num_heads, 1, src_len)
+            key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
         )
         if attn_mask is None:
             attn_mask = key_padding_mask
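A small sketch (not part of the diff) of the padding-mask expansion above, with toy sizes: a (bsz, src_len) mask becomes one row per attention head, matching the (bsz * num_heads, 1, src_len) layout the bmm-based attention path expects.

import torch

bsz, num_heads, src_len = 2, 4, 7
key_padding_mask = torch.zeros(bsz, src_len, dtype=torch.bool)
key_padding_mask = (
    key_padding_mask.view(bsz, 1, 1, src_len)
    .expand(-1, num_heads, -1, -1)
    .reshape(bsz * num_heads, 1, src_len)
)
assert key_padding_mask.shape == (bsz * num_heads, 1, src_len)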
@@ -280,14 +254,10 @@ def multi_head_attention_forward(
         _B, _Nt, E = q.shape
         q_scaled = q * math.sqrt(1.0 / float(E))
 
-        assert not (
-            is_causal and attn_mask is None
-        ), "FIXME: is_causal not implemented for need_weights"
+        assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"
 
         if attn_mask is not None:
-            attn_output_weights = torch.baddbmm(
-                attn_mask, q_scaled, k.transpose(-2, -1)
-            )
+            attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
         else:
             attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
         attn_output_weights = softmax(attn_output_weights, dim=-1)
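A quick check (not part of the diff), on toy tensors, that the baddbmm form above is the usual masked score computation: with the default alpha and beta, baddbmm(mask, q_scaled, k^T) equals mask + q_scaled @ k^T, which the softmax line then turns into softmax(Q K^T / sqrt(E) + mask).

import math
import torch

B, Nt, Ns, E = 8, 5, 7, 4
q, k = torch.randn(B, Nt, E), torch.randn(B, Ns, E)
attn_mask = torch.zeros(B, Nt, Ns)

q_scaled = q * math.sqrt(1.0 / float(E))
w_baddbmm = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
w_manual = attn_mask + torch.bmm(q_scaled, k.transpose(-2, -1))
torch.testing.assert_close(w_baddbmm, w_manual)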
@@ -296,9 +266,7 @@ def multi_head_attention_forward(
 
         attn_output = torch.bmm(attn_output_weights, v)
 
-        attn_output = (
-            attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
-        )
+        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
         attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
         attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
 
@@ -326,12 +294,8 @@ def multi_head_attention_forward(
         k = k.view(bsz, num_heads, src_len, head_dim)
         v = v.view(bsz, num_heads, src_len, head_dim)
 
-        attn_output = scaled_dot_product_attention(
-            q, k, v, attn_mask, dropout_p, is_causal
-        )
-        attn_output = (
-            attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
-        )
+        attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
+        attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
 
         attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
         attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
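A final sketch (not part of the diff) of the no-weights fast path with toy sizes: q, k, v are viewed as (bsz, num_heads, seq, head_dim), scaled_dot_product_attention runs, and the heads are flattened back into embed_dim before the output projection.

import torch
from torch.nn.functional import scaled_dot_product_attention

bsz, num_heads, tgt_len, src_len, head_dim = 2, 4, 5, 7, 8
embed_dim = num_heads * head_dim
q = torch.randn(bsz, num_heads, tgt_len, head_dim)
k = torch.randn(bsz, num_heads, src_len, head_dim)
v = torch.randn(bsz, num_heads, src_len, head_dim)

attn_output = scaled_dot_product_attention(q, k, v, None, 0.0, False)
attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
assert attn_output.shape == (bsz * tgt_len, embed_dim)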