#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file are adapted from
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
# which is licensed under Apache License 2.0:
#
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import math
import torch

from typing import Optional


def mllama_vision_attention_forward(
    self,
    hidden_state: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
):
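    """Multi-head self-attention forward for the Mllama vision encoder.

    The computation follows the reference HuggingFace implementation this file
    is adapted from; the softmax is routed through ipex_llm's attention_softmax
    helper, which performs the fp32 upcast noted below.
    """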
    query = self.q_proj(hidden_state)
    key = self.k_proj(hidden_state)
    value = self.v_proj(hidden_state)

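    # Unpack the sequence lengths, then split the fused hidden dimension into
    # (num_heads, head_dim) and move the head axis in front of the sequence
    # axis for batched matmuls.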
    batch_size, q_seq_len, _ = query.shape
    _, kv_seq_len, _ = key.shape

    query = query.view(batch_size, q_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
    key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
    value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim).transpose(1, 2)

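    # Scaled dot-product attention scores: (Q @ K^T) / sqrt(head_dim).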
    attn_weights = torch.matmul(query, key.transpose(2, 3)) / math.sqrt(self.head_dim)

    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    from ipex_llm.transformers.models.common import attention_softmax
    attn_weights = attention_softmax(attn_weights, self.training)

    attn_output = torch.matmul(attn_weights, value)

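    # Merge the attention heads back into a single hidden dimension before the
    # output projection.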
    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.reshape(batch_size, q_seq_len, -1)

    output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return output, attn_weights
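

# A hypothetical usage sketch (not part of the adapted file): the function above
# mirrors the forward signature of the vision attention module in transformers'
# modeling_mllama.py, so one plausible way to apply it is to patch it onto that
# class. The class path below is an assumption based on the transformers layout;
# the actual wiring inside ipex_llm may differ.
#
#     from transformers.models.mllama.modeling_mllama import MllamaVisionAttention
#     MllamaVisionAttention.forward = mllama_vision_attention_forward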