From e9e20397a4ec99f5607b91a27179f7262e763fd7 Mon Sep 17 00:00:00 2001 From: "nate.river" Date: Thu, 20 Mar 2025 17:11:17 +0800 Subject: [PATCH 01/12] use official mindspore for CI (#1999) --- .github/workflows/ci_pipeline.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci_pipeline.yaml b/.github/workflows/ci_pipeline.yaml index 1c52c64df..2daa959a3 100644 --- a/.github/workflows/ci_pipeline.yaml +++ b/.github/workflows/ci_pipeline.yaml @@ -129,8 +129,7 @@ jobs: OS: ubuntu-latest PYTHON: 3.11 run: | - python .github/install_mindspore.py - pip install -r download.txt + pip install mindspore - name: Test with pytest run: | pytest -vs tests/transformers/models/${{ matrix.alpha }}*/test_modeling* From a660aec9d1b945b770e80acb91810da6550797eb Mon Sep 17 00:00:00 2001 From: "nate.river" Date: Thu, 20 Mar 2025 19:26:16 +0800 Subject: [PATCH 02/12] Update make_wheel_releases.yml --- .github/workflows/make_wheel_releases.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/make_wheel_releases.yml b/.github/workflows/make_wheel_releases.yml index 554ce5017..76dd5354a 100644 --- a/.github/workflows/make_wheel_releases.yml +++ b/.github/workflows/make_wheel_releases.yml @@ -27,7 +27,7 @@ jobs: run: python -m build --wheel - name: Upload file - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: mindnlp-whl path: dist/* From 797fade8c0bc98a3a57aa18def233166ee95dcb5 Mon Sep 17 00:00:00 2001 From: hongziqi <1102229410@qq.com> Date: Fri, 21 Mar 2025 16:28:03 +0800 Subject: [PATCH 03/12] Fix mint.nonzero interface call (#2001) --- mindnlp/core/ops/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindnlp/core/ops/array.py b/mindnlp/core/ops/array.py index 1cd14c2c6..1eb318aee 100644 --- a/mindnlp/core/ops/array.py +++ b/mindnlp/core/ops/array.py @@ -130,7 +130,7 @@ def narrow(input, dim, start, length): has_nonzero = hasattr(mindspore.mint, 'nonzero') def nonzero(input, *, as_tuple=False): if use_pyboost() and has_nonzero: - return mindspore.mint.nonzero(input, as_tuple) + return mindspore.mint.nonzero(input, as_tuple=as_tuple) _nonzero = _get_cache_prim(ops.NonZero)() out = _nonzero(input) if as_tuple: From 153bedab06cc3691f3362c6a37b300ebd15e4b88 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 22 Mar 2025 02:10:56 +0800 Subject: [PATCH 04/12] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BA=86glpn=E5=9C=A8m?= =?UTF-8?q?indspore=E6=A1=86=E6=9E=B6=E4=B8=8B=E7=9A=84=E6=8E=A8=E7=90=86?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E7=A7=BB=E6=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mindnlp/demo.py | 32 + mindnlp/transformers/modeling_utils.py | 6 +- mindnlp/transformers/models/__init__.py | 3 + mindnlp/transformers/models/glpn/__init__.py | 14 + .../models/glpn/configuration_glpn.py | 135 +++ .../models/glpn/feature_extraction_glpn.py | 36 + .../models/glpn/image_processing_glpn.py | 271 +++++++ .../transformers/models/glpn/modeling_glpn.py | 767 ++++++++++++++++++ 8 files changed, 1261 insertions(+), 3 deletions(-) create mode 100644 mindnlp/demo.py create mode 100644 mindnlp/transformers/models/glpn/__init__.py create mode 100644 mindnlp/transformers/models/glpn/configuration_glpn.py create mode 100644 mindnlp/transformers/models/glpn/feature_extraction_glpn.py create mode 100644 mindnlp/transformers/models/glpn/image_processing_glpn.py create mode 100644 mindnlp/transformers/models/glpn/modeling_glpn.py diff --git a/mindnlp/demo.py 
b/mindnlp/demo.py new file mode 100644 index 000000000..2ada2bbaa --- /dev/null +++ b/mindnlp/demo.py @@ -0,0 +1,32 @@ +from mindnlp.transformers import GLPNImageProcessor, GLPNForDepthEstimation +import mindspore as ms +import numpy as np +from PIL import Image +import requests + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +processor = GLPNImageProcessor.from_pretrained("vinvino02/glpn-kitti") +model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti") + +# prepare image for the model +inputs = processor(images=image, return_tensors="ms") + + +outputs = model(**inputs) +predicted_depth = outputs.predicted_depth + +# interpolate to original size +prediction = ms.ops.interpolate( + predicted_depth.unsqueeze(1), + size=image.size[::-1], + mode="bicubic", + align_corners=False, +) + +# visualize the prediction +output = prediction.squeeze().numpy() +formatted = (output * 255 / np.max(output)).astype("uint8") +depth = Image.fromarray(formatted) +depth.show() diff --git a/mindnlp/transformers/modeling_utils.py b/mindnlp/transformers/modeling_utils.py index 414404aaf..e5b5fa8ad 100644 --- a/mindnlp/transformers/modeling_utils.py +++ b/mindnlp/transformers/modeling_utils.py @@ -252,9 +252,9 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi return False # If the model does, the incoming `state_dict` and the `model_to_load` must be the same dtype - first_key = list(model_to_load.state_dict().keys())[0] - if start_prefix + first_key in state_dict: - return state_dict[start_prefix + first_key].dtype == model_to_load.state_dict()[first_key].dtype + # first_key = list(model_to_load.state_dict().keys())[0] + # if start_prefix + first_key in state_dict: + # return state_dict[start_prefix + first_key].dtype == model_to_load.state_dict()[first_key].dtype # For cases when the `state_dict` doesn't contain real weights to the model (`test_model_weights_reload_no_missing_tied_weights`) return False diff --git a/mindnlp/transformers/models/__init__.py b/mindnlp/transformers/models/__init__.py index 722aa0f7d..0d8402be0 100644 --- a/mindnlp/transformers/models/__init__.py +++ b/mindnlp/transformers/models/__init__.py @@ -104,6 +104,7 @@ fsmt, gemma, gemma2, + glpn, git, openai, gpt2, @@ -352,6 +353,7 @@ from .git import * from .openai import * from .gptj import * +from .glpn import * from .gpt_neo import * from .gpt_neox import * from .gpt_neox_japanese import * @@ -594,6 +596,7 @@ __all__.extend(fastspeech2_conformer.__all__) __all__.extend(openai.__all__) __all__.extend(gptj.__all__) +__all__.extend(glpn.__all__) __all__.extend(gemma.__all__) __all__.extend(gemma2.__all__) __all__.extend(git.__all__) diff --git a/mindnlp/transformers/models/glpn/__init__.py b/mindnlp/transformers/models/glpn/__init__.py new file mode 100644 index 000000000..88b21ab31 --- /dev/null +++ b/mindnlp/transformers/models/glpn/__init__.py @@ -0,0 +1,14 @@ +""" +GLPN Model. 
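+
+Re-exports the GLPN configuration, feature extraction, image processing and modeling classes.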
+""" + +from .configuration_glpn import * +from .feature_extraction_glpn import * +from .image_processing_glpn import * +from .modeling_glpn import * + +__all__ = [] +__all__.extend(configuration_glpn.__all__) +__all__.extend(feature_extraction_glpn.__all__) +__all__.extend(image_processing_glpn.__all__) +__all__.extend(modeling_glpn.__all__) \ No newline at end of file diff --git a/mindnlp/transformers/models/glpn/configuration_glpn.py b/mindnlp/transformers/models/glpn/configuration_glpn.py new file mode 100644 index 000000000..5eb3c1e1a --- /dev/null +++ b/mindnlp/transformers/models/glpn/configuration_glpn.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2022 KAIST and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GLPN model configuration""" + +from ...configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class GLPNConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GLPNModel`]. It is used to instantiate an GLPN + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the GLPN + [vinvino02/glpn-kitti](https://huggingface.co/vinvino02/glpn-kitti) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_encoder_blocks (`int`, *optional*, defaults to 4): + The number of encoder blocks (i.e. stages in the Mix Transformer encoder). + depths (`List[int]`, *optional*, defaults to `[2, 2, 2, 2]`): + The number of layers in each encoder block. + sr_ratios (`List[int]`, *optional*, defaults to `[8, 4, 2, 1]`): + Sequence reduction ratios in each encoder block. + hidden_sizes (`List[int]`, *optional*, defaults to `[32, 64, 160, 256]`): + Dimension of each of the encoder blocks. + patch_sizes (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`): + Patch size before each encoder block. + strides (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`): + Stride before each encoder block. + num_attention_heads (`List[int]`, *optional*, defaults to `[1, 2, 5, 8]`): + Number of attention heads for each attention layer in each block of the Transformer encoder. + mlp_ratios (`List[int]`, *optional*, defaults to `[4, 4, 4, 4]`): + Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the + encoder blocks. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + drop_path_rate (`float`, *optional*, defaults to 0.1): + The dropout probability for stochastic depth, used in the blocks of the Transformer encoder. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + decoder_hidden_size (`int`, *optional*, defaults to 64): + The dimension of the decoder. + max_depth (`int`, *optional*, defaults to 10): + The maximum depth of the decoder. + head_in_index (`int`, *optional*, defaults to -1): + The index of the features to use in the head. + + Example: + + ```python + >>> from transformers import GLPNModel, GLPNConfig + + >>> # Initializing a GLPN vinvino02/glpn-kitti style configuration + >>> configuration = GLPNConfig() + + >>> # Initializing a model from the vinvino02/glpn-kitti style configuration + >>> model = GLPNModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glpn" + + def __init__( + self, + num_channels=3, + num_encoder_blocks=4, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + hidden_sizes=[32, 64, 160, 256], + patch_sizes=[7, 3, 3, 3], + strides=[4, 2, 2, 2], + num_attention_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + drop_path_rate=0.1, + layer_norm_eps=1e-6, + decoder_hidden_size=64, + max_depth=10, + head_in_index=-1, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_channels = num_channels + self.num_encoder_blocks = num_encoder_blocks + self.depths = depths + self.sr_ratios = sr_ratios + self.hidden_sizes = hidden_sizes + self.patch_sizes = patch_sizes + self.strides = strides + self.mlp_ratios = mlp_ratios + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.drop_path_rate = drop_path_rate + self.layer_norm_eps = layer_norm_eps + self.decoder_hidden_size = decoder_hidden_size + self.max_depth = max_depth + self.head_in_index = head_in_index + + +__all__ = ["GLPNConfig"] diff --git a/mindnlp/transformers/models/glpn/feature_extraction_glpn.py b/mindnlp/transformers/models/glpn/feature_extraction_glpn.py new file mode 100644 index 000000000..c5b621917 --- /dev/null +++ b/mindnlp/transformers/models/glpn/feature_extraction_glpn.py @@ -0,0 +1,36 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for GLPN.""" + +import warnings + +from ....utils import logging +from .image_processing_glpn import GLPNImageProcessor + + +logger = logging.get_logger(__name__) + + +class GLPNFeatureExtractor(GLPNImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class GLPNFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use GLPNImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["GLPNFeatureExtractor"] diff --git a/mindnlp/transformers/models/glpn/image_processing_glpn.py b/mindnlp/transformers/models/glpn/image_processing_glpn.py new file mode 100644 index 000000000..b24d8f334 --- /dev/null +++ b/mindnlp/transformers/models/glpn/image_processing_glpn.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for GLPN.""" + +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + +import numpy as np +import PIL.Image + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ....utils import TensorType, logging, requires_backends + + + +import mindspore + + +logger = logging.get_logger(__name__) + + +class GLPNImageProcessor(BaseImageProcessor): + r""" + Constructs a GLPN image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions, rounding them down to the closest multiple of + `size_divisor`. Can be overridden by `do_resize` in `preprocess`. + size_divisor (`int`, *optional*, defaults to 32): + When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest + multiple of `size_divisor`. Can be overridden by `size_divisor` in `preprocess`. + resample (`PIL.Image` resampling filter, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Can be + overridden by `do_rescale` in `preprocess`. 
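+
+    Example (an illustrative sketch; assumes the `vinvino02/glpn-kitti` checkpoint and the
+    sample image are downloadable):
+
+    ```python
+    >>> from mindnlp.transformers import GLPNImageProcessor
+    >>> from PIL import Image
+    >>> import requests
+
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> processor = GLPNImageProcessor.from_pretrained("vinvino02/glpn-kitti")
+    >>> # height and width are rounded down to a multiple of 32, pixel values rescaled to [0, 1]
+    >>> inputs = processor(images=image, return_tensors="ms")
+    ```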
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size_divisor: int = 32, + resample=PILImageResampling.BILINEAR, + do_rescale: bool = True, + **kwargs, + ) -> None: + self.do_resize = do_resize + self.do_rescale = do_rescale + self.size_divisor = size_divisor + self.resample = resample + super().__init__(**kwargs) + + def resize( + self, + image: np.ndarray, + size_divisor: int, + resample: float, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor. + + If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160). + + Args: + image (`np.ndarray`): + The image to resize. + size_divisor (`int`): + The image is resized so its height and width are rounded down to the closest multiple of + `size_divisor`. + resample: + `PIL.Image` resampling filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If `None`, the channel dimension format of the input + image is used. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not set, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The resized image. + """ + height, width = get_image_size(image, channel_dim=input_data_format) + # Rounds the height and width down to the closest multiple of size_divisor + new_h = height // size_divisor * size_divisor + new_w = width // size_divisor * size_divisor + image = resize( + image, + (new_h, new_w), + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + def preprocess( + self, + images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]], + do_resize: Optional[bool] = None, + size_divisor: Optional[int] = None, + resample=None, + do_rescale: Optional[bool] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Preprocess the given images. + + Args: + images (`PIL.Image.Image` or `TensorType` or `List[np.ndarray]` or `List[TensorType]`): + Images to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_normalize=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`. + size_divisor (`int`, *optional*, defaults to `self.size_divisor`): + When `do_resize` is `True`, images are resized so their height and width are rounded down to the + closest multiple of `size_divisor`. 
+ resample (`PIL.Image` resampling filter, *optional*, defaults to `self.resample`): + `PIL.Image` resampling filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - `None`: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + resample = resample if resample is not None else self.resample + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + # Here, the rescale() method uses a constant rescale_factor. It does not need to be validated + # with a rescale_factor. + validate_preprocess_arguments( + do_resize=do_resize, + size=size_divisor, # Here, size_divisor is used as a parameter for optimal resizing instead of size. + resample=resample, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(img) for img in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
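+            # The format is inferred from the first image only; pass `input_data_format`
+            # explicitly if the batch mixes channel orders.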
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image, size_divisor=size_divisor, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, scale=1 / 255, input_data_format=input_data_format) for image in images] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + outputs: "DepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + for depth, target_size in zip(predicted_depth, target_sizes): + if target_size is not None: + depth = depth[None, None, ...] + depth = mindspore.ops.interpolate(depth, size=target_size, mode="bicubic", align_corners=False) + depth = depth.squeeze() + + results.append({"predicted_depth": depth}) + + return results + + +__all__ = ["GLPNImageProcessor"] \ No newline at end of file diff --git a/mindnlp/transformers/models/glpn/modeling_glpn.py b/mindnlp/transformers/models/glpn/modeling_glpn.py new file mode 100644 index 000000000..e804f1e0b --- /dev/null +++ b/mindnlp/transformers/models/glpn/modeling_glpn.py @@ -0,0 +1,767 @@ +# coding=utf-8 +# Copyright 2022 KAIST and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
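+#
+# NOTE: this file is a MindSpore port of the PyTorch GLPN implementation; the layers
+# below are built on mindspore.nn.Cell (nn.Dense / nn.Conv2d in place of the torch.nn counterparts).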
+"""PyTorch GLPN model.""" + +import math +from typing import List, Optional, Tuple, Union + +import mindspore +#import mindspore.utils.checkpoint +from mindspore import nn + +from ....common.activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...ms_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import logging +from .configuration_glpn import GLPNConfig + + +logger = logging.get_logger(__name__) + + +# General docstring +_CONFIG_FOR_DOC = "GLPNConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "vinvino02/glpn-kitti" +_EXPECTED_OUTPUT_SHAPE = [1, 512, 15, 20] + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: mindspore.Tensor, drop_prob: float = 0.0, training: bool = False) -> mindspore.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + mindspore.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerDropPath +class GLPNDropPath(nn.Cell): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def construct(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerOverlapPatchEmbeddings +class GLPNOverlapPatchEmbeddings(nn.Cell): + """Construct the overlapping patch embeddings.""" + + def __init__(self, patch_size, stride, num_channels, hidden_size): + super().__init__() + self.proj = nn.Conv2d( + num_channels, + hidden_size, + kernel_size=patch_size, + stride=stride, + #padding=patch_size // 2, + ) + + self.layer_norm = nn.LayerNorm([hidden_size]) + + def construct(self, pixel_values): + embeddings = self.proj(pixel_values) + _, _, height, width = embeddings.shape + # (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels) + # this can be fed to a Transformer layer + embeddings = embeddings.flatten(2).transpose(1, 2) + embeddings = self.layer_norm(embeddings) + return embeddings, height, width + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerEfficientSelfAttention +class GLPNEfficientSelfAttention(nn.Cell): + """SegFormer's efficient self-attention mechanism. 
Employs the sequence reduction process introduced in the [PvT + paper](https://arxiv.org/abs/2102.12122).""" + + def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio): + super().__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})" + ) + + self.attention_head_size = int(self.hidden_size / self.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Dense(self.hidden_size, self.all_head_size) + self.key = nn.Dense(self.hidden_size, self.all_head_size) + self.value = nn.Dense(self.hidden_size, self.all_head_size) + + #self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.sr_ratio = sequence_reduction_ratio + if sequence_reduction_ratio > 1: + self.sr = nn.Conv2d( + hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio + ) + self.layer_norm = nn.LayerNorm([hidden_size]) + + def transpose_for_scores(self, hidden_states): + #hidden_states=mindspore.tensor(hidden_states) + #hidden_states=hidden_states.view((19200,64)) + new_shape = hidden_states.shape[:-1] + (self.num_attention_heads, self.attention_head_size) + hidden_states = hidden_states.view(new_shape) + return hidden_states.permute(0, 2, 1, 3) + + def construct( + self, + hidden_states, + height, + width, + output_attentions=False, + ): + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + if self.sr_ratio > 1: + batch_size, seq_len, num_channels = hidden_states.shape + # Reshape to (batch_size, num_channels, height, width) + hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width) + # Apply sequence reduction + hidden_states = self.sr(hidden_states) + # Reshape back to (batch_size, seq_len, num_channels) + hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1) + hidden_states = self.layer_norm(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = mindspore.ops.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = mindspore.ops.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
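+        # NOTE: attention dropout is disabled in this port (the nn.Dropout layer and the
+        # call below are commented out; the default attention_probs_dropout_prob is 0.0).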
+ #attention_probs = self.dropout(attention_probs) + + context_layer = mindspore.ops.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.shape[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerSelfOutput +class GLPNSelfOutput(nn.Cell): + def __init__(self, config, hidden_size): + super().__init__() + self.dense = nn.Dense(hidden_size, hidden_size) + #self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def construct(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + #hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerAttention with Segformer->GLPN +class GLPNAttention(nn.Cell): + def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio): + super().__init__() + self.self = GLPNEfficientSelfAttention( + config=config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + ) + self.output = GLPNSelfOutput(config, hidden_size=hidden_size) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def construct(self, hidden_states, height, width, output_attentions=False): + self_outputs = self.self(hidden_states, height, width, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerDWConv +class GLPNDWConv(nn.Cell): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, has_bias=True, group=dim) + + def construct(self, hidden_states, height, width): + batch_size, seq_len, num_channels = hidden_states.shape + hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width) + hidden_states = self.dwconv(hidden_states) + hidden_states = hidden_states.flatten(2).transpose(1, 2) + + return hidden_states + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerMixFFN with Segformer->GLPN +class GLPNMixFFN(nn.Cell): + def __init__(self, config, in_features, hidden_features=None, out_features=None): + super().__init__() + out_features = out_features or in_features + self.dense1 = nn.Dense(in_features, hidden_features) + self.dwconv = GLPNDWConv(hidden_features) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = 
ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + self.dense2 = nn.Dense(hidden_features, out_features) + #self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def construct(self, hidden_states, height, width): + hidden_states = self.dense1(hidden_states) + hidden_states = self.dwconv(hidden_states, height, width) + hidden_states = self.intermediate_act_fn(hidden_states) + #hidden_states = self.dropout(hidden_states) + hidden_states = self.dense2(hidden_states) + #hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerLayer with Segformer->GLPN +class GLPNLayer(nn.Cell): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio): + super().__init__() + self.layer_norm_1 = nn.LayerNorm([hidden_size]) + self.attention = GLPNAttention( + config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + ) + self.drop_path = GLPNDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.layer_norm_2 = nn.LayerNorm([hidden_size]) + mlp_hidden_size = int(hidden_size * mlp_ratio) + self.mlp = GLPNMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size) + + def construct(self, hidden_states, height, width, output_attentions=False): + self_attention_outputs = self.attention( + self.layer_norm_1(hidden_states), # in GLPN, layernorm is applied before self-attention + height, + width, + output_attentions=output_attentions, + ) + + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection (with stochastic depth) + attention_output = self.drop_path(attention_output) + hidden_states = attention_output + hidden_states + + mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width) + + # second residual connection (with stochastic depth) + mlp_output = self.drop_path(mlp_output) + layer_output = mlp_output + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class GLPNEncoder(nn.Cell): + def __init__(self, config): + super().__init__() + self.config = config + + # stochastic depth decay rule + dpr = [x.item() for x in mindspore.ops.linspace(0, config.drop_path_rate, sum(config.depths))] + + # patch embeddings + embeddings = [] + for i in range(config.num_encoder_blocks): + embeddings.append( + GLPNOverlapPatchEmbeddings( + patch_size=config.patch_sizes[i], + stride=config.strides[i], + num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1], + hidden_size=config.hidden_sizes[i], + ) + ) + self.patch_embeddings = nn.CellList(embeddings) + + # Transformer blocks + blocks = [] + cur = 0 + for i in range(config.num_encoder_blocks): + # each block consists of layers + layers = [] + if i != 0: + cur += config.depths[i - 1] + for j in range(config.depths[i]): + layers.append( + GLPNLayer( + config, + hidden_size=config.hidden_sizes[i], + num_attention_heads=config.num_attention_heads[i], + drop_path=dpr[cur + j], + sequence_reduction_ratio=config.sr_ratios[i], + mlp_ratio=config.mlp_ratios[i], + ) + ) + blocks.append(nn.CellList(layers)) + + self.block = nn.CellList(blocks) + + # Layer norms + self.layer_norm = nn.CellList( + [nn.LayerNorm([config.hidden_sizes[i]]) for i in range(config.num_encoder_blocks)] + 
) + + def construct( + self, + pixel_values, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + pixel_values=mindspore.Tensor(pixel_values) + batch_size = pixel_values.shape[0] + + hidden_states = pixel_values + for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)): + embedding_layer, block_layer, norm_layer = x + # first, obtain patch embeddings + hidden_states, height, width = embedding_layer(hidden_states) + # second, send embeddings through blocks + for i, blk in enumerate(block_layer): + layer_outputs = blk(hidden_states, height, width, output_attentions) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + # third, apply layer norm + hidden_states = norm_layer(hidden_states) + # fourth, optionally reshape back to (batch_size, num_channels, height, width) + hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous() + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class GLPNPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GLPNConfig + base_model_prefix = "glpn" + main_input_name = "pixel_values" + _no_split_modules = [] + + # Copied from transformers.models.segformer.modeling_segformer.SegformerPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Dense, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +GLPN_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`GLPNConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GLPN_INPUTS_DOCSTRING = r""" + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`GLPNImageProcessor.__call__`] for details. 
+ + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + + +class GLPNModel(GLPNPreTrainedModel): + # Copied from transformers.models.segformer.modeling_segformer.SegformerModel.__init__ with Segformer->GLPN + def __init__(self, config): + super().__init__(config) + self.config = config + + # hierarchical Transformer encoder + self.encoder = GLPNEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + + # Copied from transformers.models.segformer.modeling_segformer.SegformerModel.forward + def forward( + self, + pixel_values: mindspore.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_outputs = self.encoder( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class GLPNSelectiveFeatureFusion(nn.Cell): + """ + Selective Feature Fusion module, as explained in the [paper](https://arxiv.org/abs/2201.07436) (section 3.4). This + module adaptively selects and integrates local and global features by attaining an attention map for each feature. 
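+    The two-channel attention map produced by the final sigmoid weights the inputs element-wise:
+    `hybrid = local_features * attn[:, 0] + global_features * attn[:, 1]` (see `construct` below).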
+ """ + + def __init__(self, in_channel=64): + super().__init__() + + self.convolutional_layer1 = nn.SequentialCell( + nn.Conv2d(in_channels=int(in_channel * 2), out_channels=in_channel, kernel_size=3, stride=1), + nn.BatchNorm2d(in_channel), + nn.ReLU(), + ) + + self.convolutional_layer2 = nn.SequentialCell( + nn.Conv2d(in_channels=in_channel, out_channels=int(in_channel / 2), kernel_size=3, stride=1), + nn.BatchNorm2d(int(in_channel / 2)), + nn.ReLU(), + ) + + self.convolutional_layer3 = nn.Conv2d( + in_channels=int(in_channel / 2), out_channels=2, kernel_size=3, stride=1) + + self.sigmoid = nn.Sigmoid() + + def construct(self, local_features, global_features): + # concatenate features along the channel dimension + features = mindspore.ops.cat((local_features, global_features),axis=1) + + # pass through convolutional layers + features = self.convolutional_layer1(features) + features = self.convolutional_layer2(features) + features = self.convolutional_layer3(features) + # apply sigmoid to get two-channel attention map + attn = self.sigmoid(features) + # construct hybrid features by adding element-wise + hybrid_features = local_features * attn[:, 0, :, :].unsqueeze(1) + global_features * attn[ + :, 1, :, : + ].unsqueeze(1) + + return hybrid_features + + +class GLPNDecoderStage(nn.Cell): + def __init__(self, in_channels, out_channels): + super().__init__() + should_skip = in_channels == out_channels + self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1) if not should_skip else nn.Identity() + self.fusion = GLPNSelectiveFeatureFusion(out_channels) + self.upsample=None + + + def construct(self, hidden_state, residual=None): + size=(hidden_state.shape[-2]*2,hidden_state.shape[-1]*2) + self.upsample = nn.Upsample(size=size, mode="bilinear", align_corners=False) + hidden_state = self.convolution(hidden_state) + if residual is not None: + hidden_state = self.fusion(hidden_state, residual) + + #hidden_state=hidden_state.squeeze(0) + hidden_state = self.upsample(hidden_state) + #hidden_state=hidden_state.unsqueeze(0) + + return hidden_state + + hidden_state = self.upsample(hidden_state) + return hidden_state + + +class GLPNDecoder(nn.Cell): + def __init__(self, config): + super().__init__() + # we use features from end -> start + reserved_hidden_sizes = config.hidden_sizes[::-1] + out_channels = config.decoder_hidden_size + + self.stages = nn.CellList( + [GLPNDecoderStage(hidden_size, out_channels) for hidden_size in reserved_hidden_sizes] + ) + # don't fuse in first stage + self.stages[0].fusion = None + + self.final_upsample = None + + def construct(self, hidden_states: List[mindspore.Tensor]) -> List[mindspore.Tensor]: + + stage_hidden_states = [] + stage_hidden_state = None + for hidden_state, stage in zip(hidden_states[::-1], self.stages): + stage_hidden_state = stage(hidden_state, stage_hidden_state) + stage_hidden_states.append(stage_hidden_state) + + size=(stage_hidden_state.shape[-2]*2,stage_hidden_state.shape[-1]*2) + self.final_upsample=nn.Upsample(size=size, mode="bilinear", align_corners=False) + stage_hidden_states[-1] = self.final_upsample(stage_hidden_state) + + return stage_hidden_states + + +class SiLogLoss(nn.Cell): + r""" + Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://arxiv.org/abs/1406.2283). + + $$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}^{2}\right)$$ where $d_{i}=\log y_{i}-\log + y_{i}^{*}$. 
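+    `construct` below returns the square root of this quantity with the second term taken as
+    $\lambda\left(\frac{1}{n} \sum_{i} d_{i}\right)^{2}$, i.e. it computes
+    $\sqrt{\frac{1}{n} \sum_{i} d_{i}^{2}-\lambda\left(\frac{1}{n} \sum_{i} d_{i}\right)^{2}}$,
+    where $\lambda$ = `lambd` (0.5 by default) and the sums run over pixels with `target > 0`.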
+ + """ + + def __init__(self, lambd=0.5): + super().__init__() + self.lambd = lambd + + def construct(self, pred, target): + valid_mask = (target > 0).detach() + diff_log = mindspore.log(target[valid_mask]) - mindspore.log(pred[valid_mask]) + loss = mindspore.sqrt(mindspore.pow(diff_log, 2).mean() - self.lambd * mindspore.ops.pow(diff_log.mean(), 2)) + + return loss + + +class GLPNDepthEstimationHead(nn.Cell): + def __init__(self, config): + super().__init__() + + self.config = config + + channels = config.decoder_hidden_size + self.head = nn.SequentialCell( + nn.Conv2d(channels, channels, kernel_size=3, stride=1), + nn.ReLU(), + nn.Conv2d(channels, 1, kernel_size=3, stride=1), + ) + + def construct(self, hidden_states: List[mindspore.Tensor]) -> mindspore.Tensor: + # use last features of the decoder + hidden_states = hidden_states[self.config.head_in_index] + + hidden_states = self.head(hidden_states) + + predicted_depth = mindspore.ops.sigmoid(hidden_states) * self.config.max_depth + predicted_depth = predicted_depth.squeeze(axis=1) + + return predicted_depth + + +class GLPNForDepthEstimation(GLPNPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.glpn = GLPNModel(config) + self.decoder = GLPNDecoder(config) + self.head = GLPNDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + pixel_values: mindspore.Tensor, + labels: Optional[mindspore.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[mindspore.Tensor], DepthEstimatorOutput]: + r""" + labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, GLPNForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti") + >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], + ... 
) + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.glpn( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + + out = self.decoder(hidden_states) + predicted_depth = self.head(out) + + loss = None + if labels is not None: + loss_fct = SiLogLoss() + loss = loss_fct(predicted_depth, labels) + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["GLPNForDepthEstimation", "GLPNLayer", "GLPNModel", "GLPNPreTrainedModel"] From 90b5835c72f5e2e6cf2c58cb7af8541d547e1e94 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 22 Mar 2025 02:12:17 +0800 Subject: [PATCH 05/12] delete demo --- mindnlp/demo.py | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 mindnlp/demo.py diff --git a/mindnlp/demo.py b/mindnlp/demo.py deleted file mode 100644 index 2ada2bbaa..000000000 --- a/mindnlp/demo.py +++ /dev/null @@ -1,32 +0,0 @@ -from mindnlp.transformers import GLPNImageProcessor, GLPNForDepthEstimation -import mindspore as ms -import numpy as np -from PIL import Image -import requests - -url = "http://images.cocodataset.org/val2017/000000039769.jpg" -image = Image.open(requests.get(url, stream=True).raw) - -processor = GLPNImageProcessor.from_pretrained("vinvino02/glpn-kitti") -model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti") - -# prepare image for the model -inputs = processor(images=image, return_tensors="ms") - - -outputs = model(**inputs) -predicted_depth = outputs.predicted_depth - -# interpolate to original size -prediction = ms.ops.interpolate( - predicted_depth.unsqueeze(1), - size=image.size[::-1], - mode="bicubic", - align_corners=False, -) - -# visualize the prediction -output = prediction.squeeze().numpy() -formatted = (output * 255 / np.max(output)).astype("uint8") -depth = Image.fromarray(formatted) -depth.show() From b228c1da73fd6e34e87805d42560cf43d6342b71 Mon Sep 17 00:00:00 2001 From: huadaox <60415837+huadaox@users.noreply.github.com> Date: Sat, 22 Mar 2025 20:46:21 +0800 Subject: [PATCH 06/12] =?UTF-8?q?=E3=80=90=E5=BC=80=E6=BA=90=E5=AE=9E?= =?UTF-8?q?=E4=B9=A0=E3=80=91align=E6=A8=A1=E5=9E=8B=E5=BE=AE=E8=B0=83=20I?= =?UTF-8?q?AUOS5=20(#1997)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm/finetune/Align/eval_model.py | 160 ++++++++++++++++++++++++ llm/finetune/Align/finetune.py | 203 +++++++++++++++++++++++++++++++ 2 files changed, 363 insertions(+) create mode 100644 llm/finetune/Align/eval_model.py create mode 100644 llm/finetune/Align/finetune.py diff --git a/llm/finetune/Align/eval_model.py 
b/llm/finetune/Align/eval_model.py new file mode 100644 index 000000000..b45179aea --- /dev/null +++ b/llm/finetune/Align/eval_model.py @@ -0,0 +1,160 @@ +import collections +import collections.abc + +collections.Iterable = collections.abc.Iterable + +import mindspore as ms +from mindnlp.transformers import AlignModel, AlignProcessor +from mindspore import Tensor, nn, ops, Parameter +from pycocotools.coco import COCO +import os +from tqdm import tqdm +import pickle +from concurrent.futures import ThreadPoolExecutor +import numpy as np +import gc + +HYPERPARAMS = { + "model_name": "E:/Code/align_ft_torch/cache/model/kakaobrain/align-base", + "batch_size": 4, + "val_samples": 50, + "max_length": 128, + "num_workers": 8, + "data_dir": "MSCOCO", + "data_type": "val2017", + "val_cache_file": "mscoco_preprocessed_val_50.pkl", + "save_dir": "cache/model", + "model_save_path": "cache/model/finetuned_align_model_epoch_{epoch}.ckpt", + "processor_save_path": "cache/model/finetuned_align_processor" +} + +ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend") +ms.context.reset_auto_parallel_context() + + +def setup_coco(): + dataDir = HYPERPARAMS["data_dir"] + dataType = HYPERPARAMS["data_type"] + os.makedirs(dataDir, exist_ok=True) + os.makedirs(f"{dataDir}/annotations", exist_ok=True) + os.makedirs(f"{dataDir}/{dataType}", exist_ok=True) + ann_file = f"{dataDir}/annotations/captions_{dataType}.json" + if not os.path.exists(ann_file): + ann_zip = f"{dataDir}/annotations_trainval2017.zip" + if not os.path.exists(ann_zip): + raise FileNotFoundError(f"{ann_zip} not found. Please download it manually.") + print("Extracting annotations...") + os.system(f"unzip -o {ann_zip} -d {dataDir}") + return dataDir, dataType + + +dataDir, dataType = setup_coco() +annFile = f'{dataDir}/annotations/captions_{dataType}.json' +coco = COCO(annFile) + + +def get_image_and_caption(coco, img_id, cache_dir=f"{HYPERPARAMS['data_dir']}/{HYPERPARAMS['data_type']}"): + ann_ids = coco.getAnnIds(imgIds=img_id) + anns = coco.loadAnns(ann_ids) + caption = anns[0]['caption'] + img_info = coco.loadImgs(img_id)[0] + img_path = f"{cache_dir}/{img_info['file_name']}" + image = Image.open(img_path) + if image.mode != "RGB": + image = image.convert("RGB") + return image, caption + + +def process_sample(img_id, coco): + image, caption = get_image_and_caption(coco, img_id) + processor = AlignProcessor.from_pretrained(HYPERPARAMS["processor_save_path"]) + inputs = processor( + text=caption, + images=image, + return_tensors="ms", + padding="max_length", + max_length=HYPERPARAMS["max_length"] + ) + return (inputs["input_ids"][0], inputs["attention_mask"][0], inputs["pixel_values"][0]) + + +def preprocess_and_save(coco, num_samples, cache_file): + if os.path.exists(cache_file): + print(f"Loading preprocessed data from {cache_file}") + with open(cache_file, "rb") as f: + dataset = pickle.load(f) + print(f"Loaded dataset size: {len(dataset)} samples") + return dataset + img_ids = coco.getImgIds()[:num_samples] + dataset = [] + with ThreadPoolExecutor(max_workers=HYPERPARAMS["num_workers"]) as executor: + dataset = list(tqdm(executor.map(lambda x: process_sample(x, coco), img_ids), + total=num_samples, desc=f"Preprocessing dataset ({num_samples} samples)")) + with open(cache_file, "wb") as f: + pickle.dump(dataset, f) + return dataset + + +def create_val_dataloader(coco, batch_size=HYPERPARAMS["batch_size"]): + val_dataset = preprocess_and_save(coco, HYPERPARAMS["val_samples"], HYPERPARAMS["val_cache_file"]) + val_dataloader = 
ms.dataset.GeneratorDataset( + val_dataset, + column_names=["input_ids", "attention_mask", "pixel_values"] + ).batch(batch_size) + return val_dataloader + + +class TrainingNet(nn.Cell): + def __init__(self, model): + super().__init__() + self.model = model + self.global_pool = nn.AdaptiveAvgPool2d(1) + self.text_projection = nn.Dense(768, 640) + self.logit_scale = Parameter(Tensor(np.log(1 / 0.07), dtype=ms.float32), requires_grad=True) + self.image_embeds = None + self.text_embeds = None + + def construct(self, input_ids, attention_mask, pixel_values): + embedding_output = self.model.vision_model.embeddings(pixel_values) + encoder_outputs = self.model.vision_model.encoder(embedding_output) + last_hidden_state = encoder_outputs[0] + pooled_output = self.global_pool(last_hidden_state) + self.image_embeds = pooled_output.reshape(pooled_output.shape[:2]) + text_outputs = self.model.text_model(input_ids=input_ids, attention_mask=attention_mask) + text_embeds = text_outputs[0][:, 0, :] + self.text_embeds = self.text_projection(text_embeds) + logits = ops.matmul(self.image_embeds, self.text_embeds.T) * ops.exp(self.logit_scale) + labels = ops.arange(len(logits), dtype=ms.int32) + loss_i2t = nn.CrossEntropyLoss()(logits, labels) + loss_t2i = nn.CrossEntropyLoss()(logits.T, labels) + return (loss_i2t + loss_t2i) / 2 + + +def evaluate_model(coco, epoch_to_eval): + processor = AlignProcessor.from_pretrained(HYPERPARAMS["processor_save_path"]) + model = AlignModel.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True) + net = TrainingNet(model) # 使用 TrainingNet 包装 AlignModel + param_dict = ms.load_checkpoint(HYPERPARAMS["model_save_path"].format(epoch=epoch_to_eval)) + ms.load_param_into_net(net, param_dict) # 加载到 TrainingNet + net.set_train(False) + + val_dataloader = create_val_dataloader(coco) + print(f"Val dataloader created with batch_size={HYPERPARAMS['batch_size']}, samples={HYPERPARAMS['val_samples']}") + + total_val_loss = 0 + val_steps = 0 + for batch in tqdm(val_dataloader.create_dict_iterator(), desc=f"Evaluating Epoch {epoch_to_eval}"): + loss = net(batch["input_ids"], batch["attention_mask"], batch["pixel_values"]) + total_val_loss += loss.asnumpy() + val_steps += 1 + avg_val_loss = total_val_loss / val_steps + print(f"Epoch {epoch_to_eval}, Eval Loss: {avg_val_loss:.4f}") + + gc.collect() + return avg_val_loss + + +if __name__ == "__main__": + print("Starting model evaluation...") + for epoch in range(1, 11): + evaluate_model(coco, epoch) \ No newline at end of file diff --git a/llm/finetune/Align/finetune.py b/llm/finetune/Align/finetune.py new file mode 100644 index 000000000..e3a4ffeb9 --- /dev/null +++ b/llm/finetune/Align/finetune.py @@ -0,0 +1,203 @@ +import collections +import collections.abc + +collections.Iterable = collections.abc.Iterable + +import mindspore as ms +from mindnlp.transformers import AlignModel, AlignProcessor +from mindspore import Tensor, nn, ops, Parameter +from PIL import Image +from pycocotools.coco import COCO +import os +from tqdm import tqdm +import pickle +from concurrent.futures import ThreadPoolExecutor +import numpy as np + +HYPERPARAMS = { + "model_name": "E:/Code/align_ft_torch/cache/model/kakaobrain/align-base", + "epochs": 10, + "batch_size": 4, + "learning_rate": 1e-4, + "train_samples": 200, + "max_length": 128, + "num_workers": 8, + "data_dir": "MSCOCO", + "data_type": "val2017", + "train_cache_file": "mscoco_preprocessed_train_200.pkl", + "save_dir": "cache/model", + "model_save_path": 
"cache/model/finetuned_align_model_epoch_{epoch}.ckpt", + "processor_save_path": "cache/model/finetuned_align_processor" +} + +ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend") +ms.context.reset_auto_parallel_context() + +processor = AlignProcessor.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True) +model = AlignModel.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True) +model.set_train(True) + +print("Model config:", model.config) +params = model.trainable_params() +print("Number of trainable params:", len(params)) + + +def setup_coco(): + dataDir = HYPERPARAMS["data_dir"] + dataType = HYPERPARAMS["data_type"] + os.makedirs(dataDir, exist_ok=True) + os.makedirs(f"{dataDir}/annotations", exist_ok=True) + os.makedirs(f"{dataDir}/{dataType}", exist_ok=True) + ann_file = f"{dataDir}/annotations/captions_{dataType}.json" + if not os.path.exists(ann_file): + ann_zip = f"{dataDir}/annotations_trainval2017.zip" + if not os.path.exists(ann_zip): + raise FileNotFoundError(f"{ann_zip} not found. Please download it manually.") + print("Extracting annotations...") + os.system(f"unzip -o {ann_zip} -d {dataDir}") + return dataDir, dataType + + +dataDir, dataType = setup_coco() +annFile = f'{dataDir}/annotations/captions_{dataType}.json' +coco = COCO(annFile) + + +def get_image_and_caption(coco, img_id, cache_dir=f"{HYPERPARAMS['data_dir']}/{HYPERPARAMS['data_type']}"): + ann_ids = coco.getAnnIds(imgIds=img_id) + anns = coco.loadAnns(ann_ids) + caption = anns[0]['caption'] + img_info = coco.loadImgs(img_id)[0] + img_path = f"{cache_dir}/{img_info['file_name']}" + image = Image.open(img_path) + if image.mode != "RGB": + image = image.convert("RGB") + return image, caption + + +def process_sample(img_id, coco): + image, caption = get_image_and_caption(coco, img_id) + inputs = processor( + text=caption, + images=image, + return_tensors="ms", + padding="max_length", + max_length=HYPERPARAMS["max_length"] + ) + return (inputs["input_ids"][0], inputs["attention_mask"][0], inputs["pixel_values"][0]) + + +def preprocess_and_save(coco, num_samples, cache_file): + if os.path.exists(cache_file): + print(f"Loading preprocessed data from {cache_file}") + with open(cache_file, "rb") as f: + dataset = pickle.load(f) + print(f"Loaded dataset size: {len(dataset)} samples") + return dataset + img_ids = coco.getImgIds()[:num_samples] + dataset = [] + with ThreadPoolExecutor(max_workers=HYPERPARAMS["num_workers"]) as executor: + dataset = list(tqdm(executor.map(lambda x: process_sample(x, coco), img_ids), + total=num_samples, desc=f"Preprocessing dataset ({num_samples} samples)")) + with open(cache_file, "wb") as f: + pickle.dump(dataset, f) + return dataset + + +def create_train_dataloader(coco, batch_size=HYPERPARAMS["batch_size"]): + train_dataset = preprocess_and_save(coco, HYPERPARAMS["train_samples"], HYPERPARAMS["train_cache_file"]) + train_dataloader = ms.dataset.GeneratorDataset( + train_dataset, + column_names=["input_ids", "attention_mask", "pixel_values"] + ).batch(batch_size) + return train_dataloader + + +class TrainingNet(nn.Cell): + def __init__(self, model): + super().__init__() + self.model = model + self.global_pool = nn.AdaptiveAvgPool2d(1) + self.text_projection = nn.Dense(768, 640) + self.logit_scale = Parameter(Tensor(np.log(1 / 0.07), dtype=ms.float32), requires_grad=True) + self.image_embeds = None + self.text_embeds = None + + def construct(self, input_ids, attention_mask, pixel_values): + embedding_output = self.model.vision_model.embeddings(pixel_values) + 
encoder_outputs = self.model.vision_model.encoder(embedding_output) + last_hidden_state = encoder_outputs[0] + pooled_output = self.global_pool(last_hidden_state) + self.image_embeds = pooled_output.reshape(pooled_output.shape[:2]) + text_outputs = self.model.text_model(input_ids=input_ids, attention_mask=attention_mask) + text_embeds = text_outputs[0][:, 0, :] + self.text_embeds = self.text_projection(text_embeds) + logits = ops.matmul(self.image_embeds, self.text_embeds.T) * ops.exp(self.logit_scale) + labels = ops.arange(len(logits), dtype=ms.int32) + loss_i2t = nn.CrossEntropyLoss()(logits, labels) + loss_t2i = nn.CrossEntropyLoss()(logits.T, labels) + return (loss_i2t + loss_t2i) / 2 + + +def convert_to_parameter(params): + converted = [] + for i, param in enumerate(params): + if not isinstance(param, Parameter): + name = getattr(param, 'name', f"param_{i}") if hasattr(param, 'name') else f"param_{i}" + converted.append(Parameter(param.data, name=name, requires_grad=True)) + else: + converted.append(param) + return converted + + +def finetune_model(coco, model, processor, + epochs=HYPERPARAMS["epochs"], + batch_size=HYPERPARAMS["batch_size"], + learning_rate=HYPERPARAMS["learning_rate"]): + train_dataloader = create_train_dataloader(coco, batch_size) + print(f"Train dataloader created with batch_size={batch_size}, samples={HYPERPARAMS['train_samples']}") + + params = model.trainable_params() + if not params: + print("No trainable params found, enabling all parameters.") + for param in model.parameters_and_names(): + param[1].requires_grad = True + params = model.trainable_params() + + params = convert_to_parameter(params) + print(f"Optimizer initialized with {len(params)} parameters") + net = TrainingNet(model) + optimizer = nn.Adam(params + [net.text_projection.weight, net.text_projection.bias, net.logit_scale], + learning_rate=learning_rate) + train_net = nn.TrainOneStepCell(net, optimizer) + + for epoch in range(epochs): + iterator = train_dataloader.create_dict_iterator() + total_train_loss = 0 + steps = 0 + for batch in tqdm(iterator, desc=f"Epoch {epoch + 1}/{epochs} (Train)"): + loss = train_net(batch["input_ids"], batch["attention_mask"], batch["pixel_values"]) + total_train_loss += loss.asnumpy() + steps += 1 + if steps == 1: + print(f"Epoch {epoch + 1}, Step 1 - Train Loss: {loss.asnumpy():.4f}") + logits = ops.matmul(net.image_embeds, net.text_embeds.T) * ops.exp(net.logit_scale) + print(f"Logits sample: {logits[:2, :2]}") + avg_train_loss = total_train_loss / steps + print(f"Epoch {epoch + 1}/{epochs}, Average Train Loss: {avg_train_loss:.4f}") + + param_after = net.text_projection.weight.asnumpy() + if epoch == 0: + param_before = param_after.copy() + print("Params updated:", not np.array_equal(param_before, param_after)) + + save_dir = HYPERPARAMS["save_dir"] + os.makedirs(save_dir, exist_ok=True) + ms.save_checkpoint(net, HYPERPARAMS["model_save_path"].format(epoch=epoch + 1)) + + processor.save_pretrained(HYPERPARAMS["processor_save_path"]) + return model + + +print("Starting model finetuning...") +finetuned_model = finetune_model(coco, model, processor) \ No newline at end of file From ea83de9ec1ff25eb38a9a68c90e18c1e933d4c8c Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 22 Mar 2025 23:13:31 +0800 Subject: [PATCH 07/12] format --- mindnlp/transformers/models/glpn/__init__.py | 6 ------ .../transformers/models/glpn/image_processing_glpn.py | 4 ++-- mindnlp/transformers/models/glpn/modeling_glpn.py | 9 ++------- 3 files changed, 4 insertions(+), 15 deletions(-) 
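
The TrainingNet.construct method in the Align fine-tuning scripts above implements a symmetric image-text contrastive objective: cross-entropy over the temperature-scaled similarity matrix, taken in both the image-to-text and text-to-image directions and averaged. A minimal NumPy sketch of the same computation, with illustrative names and toy embeddings that are not part of the patch:

    import numpy as np

    def symmetric_contrastive_loss(image_embeds, text_embeds, logit_scale=np.log(1 / 0.07)):
        # Scaled similarity matrix; matched image/text pairs sit on the diagonal.
        logits = image_embeds @ text_embeds.T * np.exp(logit_scale)
        labels = np.arange(len(logits))

        def cross_entropy(lg):
            lg = lg - lg.max(axis=1, keepdims=True)          # numerical stability
            log_probs = lg - np.log(np.exp(lg).sum(axis=1, keepdims=True))
            return -log_probs[np.arange(len(lg)), labels].mean()

        # Average the image->text and text->image losses, as in TrainingNet.construct.
        return (cross_entropy(logits) + cross_entropy(logits.T)) / 2

    rng = np.random.default_rng(0)
    image_embeds = rng.normal(size=(4, 8))   # toy (batch, dim) embeddings
    text_embeds = rng.normal(size=(4, 8))
    print(symmetric_contrastive_loss(image_embeds, text_embeds))
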
diff --git a/mindnlp/transformers/models/glpn/__init__.py b/mindnlp/transformers/models/glpn/__init__.py index 88b21ab31..73ca8c00b 100644 --- a/mindnlp/transformers/models/glpn/__init__.py +++ b/mindnlp/transformers/models/glpn/__init__.py @@ -6,9 +6,3 @@ from .feature_extraction_glpn import * from .image_processing_glpn import * from .modeling_glpn import * - -__all__ = [] -__all__.extend(configuration_glpn.__all__) -__all__.extend(feature_extraction_glpn.__all__) -__all__.extend(image_processing_glpn.__all__) -__all__.extend(modeling_glpn.__all__) \ No newline at end of file diff --git a/mindnlp/transformers/models/glpn/image_processing_glpn.py b/mindnlp/transformers/models/glpn/image_processing_glpn.py index b24d8f334..4ddedc18c 100644 --- a/mindnlp/transformers/models/glpn/image_processing_glpn.py +++ b/mindnlp/transformers/models/glpn/image_processing_glpn.py @@ -22,6 +22,7 @@ import numpy as np import PIL.Image +import mindspore from ...image_processing_utils import BaseImageProcessor, BatchFeature from ...image_transforms import resize, to_channel_dimension_format @@ -40,7 +41,6 @@ -import mindspore logger = logging.get_logger(__name__) @@ -268,4 +268,4 @@ def post_process_depth_estimation( return results -__all__ = ["GLPNImageProcessor"] \ No newline at end of file +__all__ = ["GLPNImageProcessor"] diff --git a/mindnlp/transformers/models/glpn/modeling_glpn.py b/mindnlp/transformers/models/glpn/modeling_glpn.py index e804f1e0b..a5d03dd61 100644 --- a/mindnlp/transformers/models/glpn/modeling_glpn.py +++ b/mindnlp/transformers/models/glpn/modeling_glpn.py @@ -147,7 +147,7 @@ def construct( height, width, output_attentions=False, - ): + ): query_layer = self.transpose_for_scores(self.query(hidden_states)) if self.sr_ratio > 1: @@ -581,17 +581,12 @@ def construct(self, hidden_state, residual=None): hidden_state = self.convolution(hidden_state) if residual is not None: hidden_state = self.fusion(hidden_state, residual) - #hidden_state=hidden_state.squeeze(0) hidden_state = self.upsample(hidden_state) #hidden_state=hidden_state.unsqueeze(0) return hidden_state - hidden_state = self.upsample(hidden_state) - return hidden_state - - class GLPNDecoder(nn.Cell): def __init__(self, config): super().__init__() @@ -637,7 +632,7 @@ def __init__(self, lambd=0.5): def construct(self, pred, target): valid_mask = (target > 0).detach() - diff_log = mindspore.log(target[valid_mask]) - mindspore.log(pred[valid_mask]) + diff_log = mindspore.ops.log(target[valid_mask]) - mindspore.ops.log(pred[valid_mask]) loss = mindspore.sqrt(mindspore.pow(diff_log, 2).mean() - self.lambd * mindspore.ops.pow(diff_log.mean(), 2)) return loss From 4adcdb860b46ce22f548b4a3bff9b58aabb0124f Mon Sep 17 00:00:00 2001 From: xing-yiren <115515101+xing-yiren@users.noreply.github.com> Date: Tue, 25 Mar 2025 14:24:14 +0800 Subject: [PATCH 08/12] =?UTF-8?q?=E8=A7=A3=E5=86=B3PeftModel.from=5Fpretra?= =?UTF-8?q?ined=E5=8A=A0=E8=BD=BD=E6=9D=83=E9=87=8D=E5=89=8D=E5=90=8Edtype?= =?UTF-8?q?=E4=B8=8D=E4=B8=80=E8=87=B4=E7=9A=84=E9=97=AE=E9=A2=98=20(#2007?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mindnlp/core/nn/modules/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindnlp/core/nn/modules/module.py b/mindnlp/core/nn/modules/module.py index c5efd9ac8..987c628b3 100644 --- a/mindnlp/core/nn/modules/module.py +++ b/mindnlp/core/nn/modules/module.py @@ -772,7 +772,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 
setattr(self, name, input_param) else: param.data_sync(True) - dtype = param.dtype + dtype = input_param.dtype param.assign_value(input_param) param.set_dtype(dtype) except Exception as ex: From 59c6eda8053decd4210ba45773aab976ae493396 Mon Sep 17 00:00:00 2001 From: outbreak-sen <85505905+outbreak-sen@users.noreply.github.com> Date: Tue, 25 Mar 2025 14:25:08 +0800 Subject: [PATCH 09/12] =?UTF-8?q?=E3=80=90=E5=BC=80=E6=BA=90=E5=AE=9E?= =?UTF-8?q?=E4=B9=A0=E3=80=91bit=E6=A8=A1=E5=9E=8B=E5=BE=AE=E8=B0=83=20=20?= =?UTF-8?q?(#1995)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm/finetune/bit/README.md | 61 ++++++++++ llm/finetune/bit/mindNLP_Bit_flowers.py | 142 ++++++++++++++++++++++++ llm/finetune/bit/mindnlplog.txt | 69 ++++++++++++ 3 files changed, 272 insertions(+) create mode 100644 llm/finetune/bit/README.md create mode 100644 llm/finetune/bit/mindNLP_Bit_flowers.py create mode 100644 llm/finetune/bit/mindnlplog.txt diff --git a/llm/finetune/bit/README.md b/llm/finetune/bit/README.md new file mode 100644 index 000000000..9638e7b34 --- /dev/null +++ b/llm/finetune/bit/README.md @@ -0,0 +1,61 @@ +# bit微调 + +实现了"HorcruxNo13/bit-50"模型在"dpdl-benchmark/oxford_flowers102"数据集上的微调实验。 +任务链接在https://gitee.com/mindspore/community/issues/IAUPCI +transformers+pytorch+3090的benchmark是自己编写的,仓库位于https://github.com/outbreak-sen/Bit_flowers102_Finetune +更改代码位于llm/finetune/bit,只包含mindnlp+mindspore的 +实验结果如下 + +## 硬件 + +资源规格:NPU: 1*Ascend-D910B(显存: 64GB), CPU: 24, 内存: 192GB + +智算中心:武汉智算中心 + +镜像:mindspore_2_5_py311_cann8 + +torch训练硬件资源规格:Nvidia 3090 + +## 模型与数据集 + +模型:"HorcruxNo13/bit-50" + +数据集:"dpdl-benchmark/oxford_flowers102" + +## Eval Loss Values 表格 + +| Epoch | mindNLP | torch | +|-------|---------------|---------------| +| 1 | 3.5184175968 | 4.6460494995 | +| 2 | 1.7758612633 | 4.2146801949 | +| 3 | 0.9314232469 | 3.8055384159 | +| 4 | 0.6095938683 | 3.4315345287 | +| 5 | 0.4878421128 | 3.1143600941 | +| 6 | 0.4401741028 | 2.8422958851 | +| 7 | 0.4239776731 | 2.6192340851 | +| 8 | 0.4162144363 | 2.4506986141 | +| 9 | 0.4113974869 | 2.3450050354 | +| 10 | 0.4095760584 | 2.2997686863 | + +## Test Accuracy 表格 + +| Epoch | mindNLP | torch | +|-------|---------------|---------------| +| 1 | 0.9219 | 0.6225 | + +## 图片分类测试 + +问题来自评估数据集的第一个问题,微调后结果准确 + +* 问题输入: + dataset['test'][0]['image'] +* 真实标签: + 26 +* mindnlp未微调前的回答: + 25 +* mindnlp微调后的回答: + 26 +* torch微调前的回答: + 41 +* torch微调后的回答: + 26 \ No newline at end of file diff --git a/llm/finetune/bit/mindNLP_Bit_flowers.py b/llm/finetune/bit/mindNLP_Bit_flowers.py new file mode 100644 index 000000000..8e1d450ab --- /dev/null +++ b/llm/finetune/bit/mindNLP_Bit_flowers.py @@ -0,0 +1,142 @@ +import mindspore as ms +import mindspore.dataset as ds +from datasets import load_dataset +from mindnlp.transformers import ( + BitForImageClassification, + AutoImageProcessor +) +from mindnlp.engine import Trainer, TrainingArguments +import os +import numpy as np +ms.set_context(device_target="Ascend") +model_name = "HorcruxNo13/bit-50" +processor = AutoImageProcessor.from_pretrained(model_name) +model = BitForImageClassification.from_pretrained( + model_name, + num_labels=102, + ignore_mismatched_sizes=True +) +dataset = load_dataset("dpdl-benchmark/oxford_flowers102", split="train") +# 将训练集按8:2的比例拆分为训练集和测试集 +dataset = dataset.train_test_split(test_size=0.2, seed=42) +dataset.save_to_disk("./flowers102") + +print(dataset) +# 选择一个测试集样本进行测试 +test_image = dataset['test'][0]['image'] +test_label = dataset['test'][0]['label'] + 
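+# Training recipe for BiT on oxford_flowers102, as configured below: 10 epochs, train batch 64,
+# eval batch 128, lr 5e-5, weight decay 0.01, inputs resized to 384x384 by the image processor,
+# max_grad_norm=0.0 (gradient clipping disabled).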
+print("\n=== 训练参数 ===") +training_args = TrainingArguments( + output_dir="./mindNLP_bit_flowers102", + evaluation_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=64, + per_device_eval_batch_size=128, + num_train_epochs=10, + gradient_accumulation_steps=1, + logging_steps=50, + load_best_model_at_end=True, + warmup_steps=0, + weight_decay=0.01, + remove_unused_columns=False, + max_grad_norm=0.0 # 禁用梯度裁剪 +) +print("\n=== 先生成np数据 ===") +train_data = [] +train_labels = [] +for item in dataset['train']: + img = item['image'].convert('RGB') + inputs = processor(images=img, return_tensors="np", size={"height": 384, "width": 384}) + train_data.append(inputs['pixel_values'][0]) + train_labels.append(item['label']) +test_data = [] +test_labels = [] +for item in dataset['test']: + img = item['image'].convert('RGB') + inputs = processor(images=img, return_tensors="np", size={"height": 384, "width": 384}) + test_data.append(inputs['pixel_values'][0]) + test_labels.append(item['label']) +train_data = np.array(train_data, dtype=np.float32) +train_labels = np.array(train_labels, dtype=np.int32) +test_data = np.array(test_data, dtype=np.float32) +test_labels = np.array(test_labels, dtype=np.int32) +print("\n=== 将预处理后的数据集转换为MindSpore格式 ===") +def create_mindspore_dataset(data, labels, batch_size, shuffle=True): + dataset = ds.NumpySlicesDataset( + { + "pixel_values": data, + "labels": labels + }, + shuffle=shuffle + ) + dataset = dataset.batch(batch_size, drop_remainder=True) + return dataset + +# 创建训练和评估数据集 +train_dataset = create_mindspore_dataset( + train_data, + train_labels, + batch_size=training_args.per_device_train_batch_size, + shuffle=True +) + +eval_dataset = create_mindspore_dataset( + test_data, + test_labels, + batch_size=training_args.per_device_eval_batch_size, + shuffle=False +) + +# 单图测试函数 +def test_single_image(model, processor, image): + inputs = processor( + images=image.convert('RGB'), + return_tensors="ms", + size={"height": 384, "width": 384} + ) + model.set_train(False) + outputs = model(**inputs) + predictions = outputs.logits.argmax(-1) + return predictions.asnumpy().item() + +print("\n=== 训练前测试 ===") +pred_before = test_single_image(model, processor, test_image) +print(f"真实标签: {test_label}") +print(f"预测标签: {pred_before}") + +import evaluate +import numpy as np +from mindnlp.engine.utils import EvalPrediction + +metric = evaluate.load("accuracy") +# 添加调试信息 +def compute_metrics(eval_pred: EvalPrediction): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + result = metric.compute(predictions=predictions, references=labels) + return result +print("\n=== 创建Trainer实例 ===") +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, +) +# trainer = Trainer( +# model=model, +# args=training_args, +# train_dataset=train_dataset, +# eval_dataset=eval_dataset +# ) +print("\n=== 训练 ===") +trainer.train() +test_results = trainer.evaluate() +print(f"Test Accuracy: {test_results['eval_accuracy']:.4f}") + +print("\n=== 训练后测试 ===") +pred_after = test_single_image(model, processor, test_image) +print(f"真实标签: {test_label}") +print(f"预测标签: {pred_after}") diff --git a/llm/finetune/bit/mindnlplog.txt b/llm/finetune/bit/mindnlplog.txt new file mode 100644 index 000000000..a3e9fa3b6 --- /dev/null +++ b/llm/finetune/bit/mindnlplog.txt @@ -0,0 +1,69 @@ +(MindSpore) [ma-user work]$python mindNLP_Bit_flowers.py +Building prefix dict from the 
default dictionary ... +Loading model from cache /tmp/jieba.cache +Loading model cost 1.241 seconds. +Prefix dict has been built successfully. +Some weights of BitForImageClassification were not initialized from the model checkpoint at HorcruxNo13/bit-50 and are newly initialized because the shapes did not match: +- classifier.1.weight: found shape (1000, 2048) in the checkpoint and (102, 2048) in the model instantiated +- classifier.1.bias: found shape (1000,) in the checkpoint and (102,) in the model instantiated +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +test-00000-of-00006.parquet: 100%|█████████████████████████████████████████████████████████████████| 420M/420M [02:14<00:00, 3.12MB/s] +test-00001-of-00006.parquet: 100%|█████████████████████████████████████████████████████████████████| 416M/416M [02:11<00:00, 3.17MB/s] +test-00002-of-00006.parquet: 0%| | 0.00/429M [00:00 Date: Tue, 25 Mar 2025 15:19:27 +0800 Subject: [PATCH 10/12] =?UTF-8?q?=E3=80=90=E5=BC=80=E6=BA=90=E5=AE=9E?= =?UTF-8?q?=E4=B9=A0=E3=80=91=20Albert=20=E6=A8=A1=E5=9E=8B=E5=BE=AE?= =?UTF-8?q?=E8=B0=83=20(#2008)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm/finetune/albert/Albert_mind.py | 130 ++++++++++++++++++ .../albert/albert_StanfordIMDB_mindnlp.md | 58 ++++++++ 2 files changed, 188 insertions(+) create mode 100644 llm/finetune/albert/Albert_mind.py create mode 100644 llm/finetune/albert/albert_StanfordIMDB_mindnlp.md diff --git a/llm/finetune/albert/Albert_mind.py b/llm/finetune/albert/Albert_mind.py new file mode 100644 index 000000000..68c41bc9a --- /dev/null +++ b/llm/finetune/albert/Albert_mind.py @@ -0,0 +1,130 @@ +import random +import mindspore as ms +from mindspore import nn, ops, Tensor +from mindspore.dataset import GeneratorDataset +from mindnlp.transformers import AlbertTokenizer, AlbertForSequenceClassification +from mindnlp.engine import Trainer, TrainingArguments +from datasets import load_dataset +import numpy as np +import os +import evaluate + +# 1. 加载预训练模型和分词器 +model_name = "albert-base-v1" +tokenizer = AlbertTokenizer.from_pretrained(model_name) +model = AlbertForSequenceClassification.from_pretrained( + model_name, num_labels=2) + +# 2. 加载IMDb数据集 +dataset = load_dataset("stanfordnlp/imdb", trust_remote_code=True) +print("dataset:", dataset) +# 3. 数据预处理函数 + + +def tokenize_function(examples): + tokenized = tokenizer( + examples["text"], + padding="max_length", + truncation=True, + max_length=512 + ) + # 添加标签到返回字典 + tokenized["labels"] = examples["label"] + return tokenized + + +# 应用预处理 +tokenized_datasets = dataset.map(tokenize_function, batched=True) + +# 检查标签分布(修正后的代码) +print("\n==== 数据分布验证 ====") + +# 检查训练集 +train_labels = np.array(tokenized_datasets["train"]["labels"]) +print("训练集标签统计:") +print("- 唯一值:", np.unique(train_labels)) +print("- 分布:", np.bincount(train_labels)) + +# 检查测试集 +test_labels = np.array(tokenized_datasets["test"]["labels"]) +print("\n测试集标签统计:") +print("- 唯一值:", np.unique(test_labels)) +print("- 分布:", np.bincount(test_labels)) +# 4. 
转换数据集格式 + +def create_dataset(data, batch_size=8): + # 将数据转换为列表以便打乱 + data_list = list(data) + random.shuffle(data_list) # 打乱数据顺序 + + def generator(): + for item in data_list: # 遍历打乱后的数据 + yield item["input_ids"], item["attention_mask"], Tensor(item["labels"], dtype=ms.int32) + + return GeneratorDataset(generator(), ["input_ids", "attention_mask", "labels"]).batch(batch_size) + + +train_dataset = create_dataset(tokenized_datasets["train"]) +eval_dataset = create_dataset(tokenized_datasets["test"]) + +# 5. 加载评估指标 +accuracy = evaluate.load("accuracy") +f1 = evaluate.load("f1") +precision = evaluate.load("precision") +recall = evaluate.load("recall") + +sample = next(iter(train_dataset)) +print("Input IDs:", sample[0]) +print("Attention Mask:", sample[1]) +print("Labels:", sample[2]) + +# 自定义指标计算函数 +def compute_metrics(eval_pred): + logits, labels = eval_pred # 直接解包为logits和labels + predictions = np.argmax(logits, axis=-1) + + return { + "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"], + "f1": f1.compute(predictions=predictions, references=labels, average="binary")["f1"], + "precision": precision.compute(predictions=predictions, references=labels, average="binary")["precision"], + "recall": recall.compute(predictions=predictions, references=labels, average="binary")["recall"] + } + + +# 6. 配置训练参数 +training_args = TrainingArguments( + num_train_epochs=3, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, + learning_rate=1e-5, + weight_decay=0.01, + output_dir="./results", + logging_dir="./logs", + logging_steps=10, + evaluation_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="accuracy", # 根据准确率选择最佳模型 + greater_is_better=True, # 准确率越高越好 +) + +# 7. 初始化并运行训练 +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, # 添加指标计算函数 +) + +trainer.train() + +# 8. 
评估模型 +eval_results = trainer.evaluate(eval_dataset) +print(f"Evaluation results: {eval_results}") +print("\nFinal evaluation results:") +print(f"Accuracy: {eval_results['eval_accuracy']:.4f}") +print(f"F1 Score: {eval_results['eval_f1']:.4f}") +print(f"Precision: {eval_results['eval_precision']:.4f}") +print(f"Recall: {eval_results['eval_recall']:.4f}") + diff --git a/llm/finetune/albert/albert_StanfordIMDB_mindnlp.md b/llm/finetune/albert/albert_StanfordIMDB_mindnlp.md new file mode 100644 index 000000000..b7012f6ba --- /dev/null +++ b/llm/finetune/albert/albert_StanfordIMDB_mindnlp.md @@ -0,0 +1,58 @@ +# Albert mindnlp StanfordIMDB reviewer Finetune + +- Albert模型微调任务链接:[【开源实习】albert模型微调 · Issue #IAUONP · MindSpore/community - Gitee.com](https://gitee.com/mindspore/community/issues/IAUONP) +- 实现了Albert-base-v1 基准权重 在 [Sentiment analysis of IMDb reviews - Stanford University] 数据集上的微调 + +- base model: [albert/albert-base-v1 · Hugging Face](https://huggingface.co/albert/albert-base-v1) +- dataset: [stanfordnlp/imdb · Datasets at Hugging Face](https://huggingface.co/datasets/stanfordnlp/imdb) + +# Requirments +## Pytorch + +- GPU: RTX 4070ti 12G +- cuda: 11.8 +- Python version: 3.10 +- torch version: 2.5.0 +- transformers version : 4.47.0 + +## Mindspore 启智社区 Ascend910B算力资源 +- Ascend: 910B +- python: 3.11 +- mindspore: 2.5.0 +- mindnlp: 0.4.1 + +# Result for finetune + +training for 3 epochs + +## torch + +| Epoch | eval_loss | +| ------------------ | --------- | +| 1 | 0.3868 | +| 2 | 0.2978 | +| 3 | 0.3293 | +| Evaluation results | 0.2978 | + +**评估结果** + +| Accuracy | Precision | Recall | F1_score | +| -------- | --------- | ------ | -------- | +| 0.9212 | 0.9218 | 0.9284 | 0.9218 | + + + +## mindspore + +| Epoch | eval_loss | +| ------------------ | --------- | +| 1 | 0.2677 | +| 2 | 0.2314 | +| 3 | 0.2332 | +| Evaluation results | 0.2314 | + +**评估结果** + +| Accuracy | Precision | Recall | F1_score | +| -------- | --------- | ------ | -------- | +| 0.9219 | 0.9238 | 0.9218 | 0.9228 | From 895e5c0b9369d13564c5c5bfd9e1d018892ded01 Mon Sep 17 00:00:00 2001 From: Yanbo Date: Tue, 25 Mar 2025 20:14:18 +0800 Subject: [PATCH 11/12] =?UTF-8?q?=E3=80=90=E5=BC=80=E6=BA=90=E5=AE=9E?= =?UTF-8?q?=E4=B9=A0=E3=80=91Mamba2=E6=A8=A1=E5=9E=8B=E8=BF=81=E7=A7=BB=20?= =?UTF-8?q?(#2009)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mindnlp/transformers/models/__init__.py | 3 + .../models/auto/configuration_auto.py | 3 + .../transformers/models/auto/modeling_auto.py | 4 + .../models/auto/tokenization_auto.py | 1 + .../transformers/models/mamba2/__init__.py | 23 + .../models/mamba2/configuration_mamba2.py | 181 ++++ .../models/mamba2/modeling_mamba2.py | 917 ++++++++++++++++++ mindnlp/utils/import_utils.py | 5 + mindnlp/utils/testing_utils.py | 72 ++ tests/transformers/generation/test_utils.py | 10 +- tests/transformers/models/mamba2/__init__.py | 0 .../models/mamba2/test_modeling_mamba2.py | 404 ++++++++ 12 files changed, 1618 insertions(+), 5 deletions(-) create mode 100644 mindnlp/transformers/models/mamba2/__init__.py create mode 100644 mindnlp/transformers/models/mamba2/configuration_mamba2.py create mode 100644 mindnlp/transformers/models/mamba2/modeling_mamba2.py create mode 100644 tests/transformers/models/mamba2/__init__.py create mode 100644 tests/transformers/models/mamba2/test_modeling_mamba2.py diff --git a/mindnlp/transformers/models/__init__.py b/mindnlp/transformers/models/__init__.py index 722aa0f7d..ff8a93c76 100644 --- 
a/mindnlp/transformers/models/__init__.py +++ b/mindnlp/transformers/models/__init__.py @@ -135,6 +135,7 @@ luke, lxmert, mamba, + mamba2, marian, markuplm, m2m_100, @@ -381,6 +382,7 @@ from .lxmert import * from .m2m_100 import * from .mamba import * +from .mamba2 import * from .marian import * from .markuplm import * from .maskformer import * @@ -626,6 +628,7 @@ __all__.extend(lxmert.__all__) __all__.extend(m2m_100.__all__) __all__.extend(mamba.__all__) +__all__.extend(mamba2.__all__) __all__.extend(marian.__all__) __all__.extend(markuplm.__all__) __all__.extend(maskformer.__all__) diff --git a/mindnlp/transformers/models/auto/configuration_auto.py b/mindnlp/transformers/models/auto/configuration_auto.py index 73d5851f2..96ae6008e 100644 --- a/mindnlp/transformers/models/auto/configuration_auto.py +++ b/mindnlp/transformers/models/auto/configuration_auto.py @@ -135,6 +135,7 @@ ("lxmert", "LxmertConfig"), ("m2m_100", "M2M100Config"), ("mamba", "MambaConfig"), + ("mamba2", "Mamba2Config"), ("marian", "MarianConfig"), ('markuplm', "MarkupLMConfig"), ("mask2former", "Mask2FormerConfig"), @@ -353,6 +354,7 @@ ("lxmert", "LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("m2m_100", "M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("mamba", "MAMBA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("mamba2", "MAMBA2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("marian", "MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("markuplm", "MARKUPLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("mask2former", "MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -608,6 +610,7 @@ ("lxmert", "LXMERT"), ("m2m_100", "M2M100"), ("mamba", "Mamba"), + ("mamba2", "Mamba2"), ("marian", "Marian"), ("markuplm", "MarkupLM"), ("mask2former", "Mask2Former"), diff --git a/mindnlp/transformers/models/auto/modeling_auto.py b/mindnlp/transformers/models/auto/modeling_auto.py index 026ea2a43..3a8d5cb33 100644 --- a/mindnlp/transformers/models/auto/modeling_auto.py +++ b/mindnlp/transformers/models/auto/modeling_auto.py @@ -151,6 +151,7 @@ ("lxmert", "LxmertModel"), ("m2m_100", "M2M100Model"), ("mamba", "MambaModel"), + ("mamba2", "Mamba2Model"), ("marian", "MarianModel"), ("markuplm", "MarkupLMModel"), ("mask2former", "Mask2FormerModel"), @@ -318,6 +319,7 @@ ("luke", "LukeForMaskedLM"), ("lxmert", "LxmertForPreTraining"), ("mamba", "MambaForCausalLM"), + ("mamba2", "Mamba2ForCausalLM"), ("mega", "MegaForMaskedLM"), ("megatron-bert", "MegatronBertForPreTraining"), ('minicpm', 'MiniCPMForCausalLM'), @@ -405,6 +407,7 @@ ("luke", "LukeForMaskedLM"), ("m2m_100", "M2M100ForConditionalGeneration"), ("mamba", "MambaForCausalLM"), + ("mamba2", "Mamba2ForCausalLM"), ("marian", "MarianMTModel"), ("mega", "MegaForMaskedLM"), ("megatron-bert", "MegatronBertForCausalLM"), @@ -491,6 +494,7 @@ ("jetmoe", "JetMoeForCausalLM"), ("llama", "LlamaForCausalLM"), ("mamba", "MambaForCausalLM"), + ("mamba2", "Mamba2ForCausalLM"), ("marian", "MarianForCausalLM"), ("mbart", "MBartForCausalLM"), ("mega", "MegaForCausalLM"), diff --git a/mindnlp/transformers/models/auto/tokenization_auto.py b/mindnlp/transformers/models/auto/tokenization_auto.py index 1ad0adbe4..054141fad 100644 --- a/mindnlp/transformers/models/auto/tokenization_auto.py +++ b/mindnlp/transformers/models/auto/tokenization_auto.py @@ -269,6 +269,7 @@ ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)), ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("mamba2", (None, "GPTNeoXTokenizerFast" if 
is_tokenizers_available() else None)), ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), ( "mbart", diff --git a/mindnlp/transformers/models/mamba2/__init__.py b/mindnlp/transformers/models/mamba2/__init__.py new file mode 100644 index 000000000..74e92a14b --- /dev/null +++ b/mindnlp/transformers/models/mamba2/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Mamba2 Model. +""" +from . import modeling_mamba2, configuration_mamba2 +from .modeling_mamba2 import * +from .configuration_mamba2 import * + +__all__ = [] +__all__.extend(modeling_mamba2.__all__) +__all__.extend(configuration_mamba2.__all__) diff --git a/mindnlp/transformers/models/mamba2/configuration_mamba2.py b/mindnlp/transformers/models/mamba2/configuration_mamba2.py new file mode 100644 index 000000000..c884be60e --- /dev/null +++ b/mindnlp/transformers/models/mamba2/configuration_mamba2.py @@ -0,0 +1,181 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MAMBA2 configuration""" + +import math + +from mindnlp.utils import logging +from ...configuration_utils import PretrainedConfig + +logger = logging.get_logger(__name__) + +class Mamba2Config(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`Mamba2Model`]. It is used to instantiate a MAMBA2 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the MAMBA2 + [state-spaces/mamba2-2.8b](https://huggingface.co/state-spaces/mamba2-2.8b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + num_heads (`int`, *optional*, defaults to 128): + Number of heads for the evolution matrices of mamba 2. + head_dim (`int`, *optional*, defaults to 64): + Dimension of each head. + vocab_size (`int`, *optional*, defaults to 32768): + Vocabulary size of the MAMBA2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Mamba2Model`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimensionality of the embeddings and hidden states. + state_size (`int`, *optional*, defaults to 128): shape of the state space latents. 
+ num_hidden_layers (`int`, *optional*, defaults to 64): + Number of hidden layers in the model. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): + The epsilon to use in the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 0): + The id of the beginning of sentence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the end of sentence token in the vocabulary. + expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size. + conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel. + n_groups (`int`, *optional*, defaults to 8): + Number of groups for the evolution matrices of mamba 2. + use_bias (`bool`, *optional*, defaults to `False`): + Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block + use_conv_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use bias in the convolution layer of the mixer block. + hidden_act (`str`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.1): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + residual_in_fp32 (`bool`, *optional*, defaults to `True`): + Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model + time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`): + Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` + time_step_min (`float`, *optional*, defaults to 0.001): + Minimum `time_step` used to bound `dt_proj.bias`. + time_step_max (`float`, *optional*, defaults to 0.1): + Maximum `time_step` used to bound `dt_proj.bias`. + time_step_floor (`float`, *optional*, defaults to 0.0001): + Minimum clamping value of the `dt_proj.bias` layer initialization. + time_step_limit (`tuple`, *optional*, defaults to `(0.0, inf)`): + Accepted range of time step values. + rescale_prenorm_residual (`bool`, *optional*, defaults to `False`): + Whether or not to rescale `out_proj` weights when initializing. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the cache should be used. + rms_norm (`bool`, *optional*, defaults to `True`): + Whether to use RMS norm or not. + chunk_size (`int`, *optional*, defaults to 256): + Size of the chunks that will comprise the sequence. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie word embeddings or not. 
+ + + Example: + + ```python + >>> from transformers import Mamba2Config, Mamba2Model + + >>> # Initializing a Mamba2 configuration + >>> configuration = Mamba2Config() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = Mamba2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mamba2" + + def __init__( + self, + num_heads=128, + head_dim=64, + vocab_size=32768, + hidden_size=4096, + state_size=128, + num_hidden_layers=64, + layer_norm_epsilon=1e-5, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + expand=2, + conv_kernel=4, + n_groups=8, + use_bias=False, + use_conv_bias=True, + hidden_act="silu", + initializer_range=0.1, + residual_in_fp32=True, + time_step_rank="auto", + time_step_min=0.001, + time_step_max=0.1, + time_step_floor=1e-4, + time_step_limit=(0.0, float("inf")), + rescale_prenorm_residual=False, + use_cache=True, + rms_norm=True, + chunk_size=256, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.state_size = state_size + self.num_hidden_layers = num_hidden_layers + self.layer_norm_epsilon = layer_norm_epsilon + self.conv_kernel = conv_kernel + self.expand = expand + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.use_bias = use_bias + self.use_conv_bias = use_conv_bias + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank + self.time_step_min = time_step_min + self.time_step_max = time_step_max + self.time_step_floor = time_step_floor + self.rescale_prenorm_residual = rescale_prenorm_residual + self.residual_in_fp32 = residual_in_fp32 + self.use_cache = use_cache + self.n_groups = n_groups + self.num_heads = num_heads + self.head_dim = head_dim + self.rms_norm = rms_norm + self.state_size = state_size + self.chunk_size = chunk_size + self.time_step_limit = time_step_limit + self.tie_word_embeddings = tie_word_embeddings + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["Mamba2Config"] diff --git a/mindnlp/transformers/models/mamba2/modeling_mamba2.py b/mindnlp/transformers/models/mamba2/modeling_mamba2.py new file mode 100644 index 000000000..d6044b53c --- /dev/null +++ b/mindnlp/transformers/models/mamba2/modeling_mamba2.py @@ -0,0 +1,917 @@ +# coding=utf-8 +# Copyright 2024 state-spaces/mamba2 org and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""MindSpore MAMBA2 model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import mindspore +from mindnlp.core import nn, ops, no_grad +from mindnlp.core.nn import CrossEntropyLoss + +from ....common.activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_utils import PreTrainedModel + +from ....utils import ( + ModelOutput, + logging, +) + +from .configuration_mamba2 import Mamba2Config + + +logger = logging.get_logger(__name__) + + + +_CHECKPOINT_FOR_DOC = "mistralai/mamba-codestral-7B-v0.1" +_CONFIG_FOR_DOC = "Mamba2Config" + + +# Helper methods for segment sum computation + + +def pad_tensor_by_size(input_tensor: mindspore.Tensor, pad_size: int): + """ + Padding x tensor with `pad_size` on the seq_len axis (axis=1) + + Assumes that we only have tensors of either size 4 or 3 + """ + pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0) + + return nn.functional.pad(input_tensor, pad_shape, mode="constant", value=0) + + +def reshape_into_chunks(input_tensor, pad_size, chunk_size): + """ + Padding input_tensor with `pad_size` on the seq_len axis (axis=1) and + simultaneously splitting it into chunk sequences. + + Assumes that we only have tensors of either size 4 or 3 + """ + # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...] + input_tensor = pad_tensor_by_size(input_tensor, pad_size) + + if len(input_tensor.shape) == 3: + # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads] + return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2]) + else: + # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] -> [bsz, -1, chunk_size, num_heads, head_dim or state_size] + return input_tensor.reshape( + input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3] + ) + + +def segment_sum(input_tensor): + """ + More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions. + """ + chunk_size = input_tensor.shape[-1] + # 1. expand input tensor to have an additional dimension and repeat along that dimension + # [..., chunk_size] -> [..., chunk_size, chunk_size] + input_tensor = input_tensor.unsqueeze(-1) + target_shape = tuple(input_tensor.shape[:-1] + (chunk_size,)) + input_tensor = input_tensor.broadcast_to(target_shape) + # 2. create a lower triangular mask with the diagonal set to 0 to 0 out elements above diag + mask = ops.tril(ops.ones(chunk_size, chunk_size, dtype=mindspore.bool_), diagonal=-1) + input_tensor = input_tensor.masked_fill(~mask, mindspore.Tensor(0, dtype=input_tensor.dtype)) + # 3. compute actual cumsum + tensor_segsum = ops.cumsum(input_tensor, dim=-2) + + # 4. 
apply mask to keep only the lower triangular part of the cumulative sum result (incl diagonal this time) + mask = ops.tril(ops.ones(chunk_size, chunk_size, dtype=mindspore.bool_), diagonal=0) + tensor_segsum = tensor_segsum.masked_fill(~mask, mindspore.Tensor(float('-inf'), dtype=tensor_segsum.dtype)) + return tensor_segsum + + +def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + +# Simple roll function for CPU and NPU +if mindspore.context.get_context("device_target") == "GPU": + from mindspore.ops import roll +else: + def roll(x: mindspore.Tensor, shifts, dims=None): + """ + + Args: + x (mindspore.Tensor): Input tensor + shifts (Union[list(int), tuple(int), int]): Specifies the number of places by which elements are shifted positively (towards larger indices) along the specified dimension. Negative shifts will roll the elements in the opposite direction. + dims (Union[list(int), tuple(int), int], optional): Specifies the dimension indexes of shape to be rolled. Default: None. If dims is None, the Tensor will be flattened before rolling and then restored to the original shape. + Returns: + Tensor, has the same shape and type as input. + """ + # If dims is None, first flatten the tensor + if dims is None: + x = x.reshape(-1) + dims = 0 + + # Convert shifts and dims to lists if they are not already + if isinstance(shifts, int): + shifts = [shifts] + if isinstance(dims, int): + dims = [dims] + + # Ensure shifts and dims have the same length + if len(shifts) != len(dims): + raise ValueError("shifts and dims must have the same length") + + # Move each dimension + for shift, dim in zip(shifts, dims): + # Handle negative shifts + if shift < 0: + shift = x.shape[dim] + shift + + # Normalize shift, ensuring it is within valid range + shift = shift % x.shape[dim] + + if shift == 0: + continue + + # Split at the specified dimension + indices = list(range(x.ndim)) + indices[0], indices[dim] = indices[dim], indices[0] + x = x.swapaxes(0, dim) # Move the target dimension to the first dimension + + shape = x.shape + x = x.reshape(shape[0], -1) # Flatten the other dimensions + + # Perform roll operation + x = ops.concat([x[shape[0]-shift:], x[:shape[0]-shift]], dim=0) + + # Restore original shape + x = x.reshape(shape) + x = x.swapaxes(0, dim) # Restore dimensions + + return x + +class Mamba2Cache: + """ + Arguments: + config: Mamba2Config + batch_size: int + dtype: mindspore.dtype + + Attributes: + dtype: (`mindspore.dtype`): + The default `dtype` used to initializing the cache. + conv_kernel_size: (`int`): + Model's convolution kernel size taken from config. + n_groups: (`int`): + Model's number of groups taken from the config - similar to tensor parallel in Transformer. + state_size: (`int`): + Model's SSM state size taken from config. + num_heads: (`int`): + The number of heads used in the linear attention / SSM. + head_dim: (`int`): + The respective dimension of the heads used in the linear attention / SSM. + intermediate_size: (`int`): + Model's intermediate_size based on (expand * hidden_dim) from config. 
+ conv_states: (`mindspore.Tensor`): + A tensor of shape `[num_layers, batch_size, conv_kernel_size, intermediate_size + 2 * n_groups * state_size]` that holds convolutional states. + ssm_states: (`mindspore.Tensor`): + A tensor of shape `[num_layers, batch_size, num_heads, head_dim, state_size]` that holds ssm states. + """ + + def __init__( + self, config: Mamba2Config, batch_size: int, dtype: mindspore.dtype = mindspore.float16): + self.dtype = dtype + self.conv_kernel_size = config.conv_kernel + self.n_groups = config.n_groups + self.state_size = config.state_size + self.num_heads = config.num_heads + self.head_dim = config.head_dim + self.intermediate_size = int(config.expand * config.hidden_size) + + self.conv_states = ops.zeros( + (config.num_hidden_layers, + batch_size, + self.intermediate_size + 2 * self.n_groups * self.state_size, + self.conv_kernel_size), + dtype=dtype, + ) + self.ssm_states = ops.zeros( + (config.num_hidden_layers, + batch_size, + self.num_heads, + self.head_dim, + self.state_size), + dtype=dtype, + ) + + def update_conv_state( + self, layer_idx: int, new_conv_state: mindspore.Tensor, cache_init: bool = False + ) -> mindspore.Tensor: + if cache_init: + self.conv_states[layer_idx] = new_conv_state + else: + self.conv_states[layer_idx] = roll(self.conv_states[layer_idx], shifts=-1, dims=-1) + self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :] + return self.conv_states[layer_idx] + + def update_ssm_state(self, layer_idx: int, new_ssm_state: mindspore.Tensor): + self.ssm_states[layer_idx] = new_ssm_state + return self.ssm_states[layer_idx] + + def reset(self): + self.conv_states.zero_() + self.ssm_states.zero_() + + +class MambaRMSNormGated(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(ops.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states, gate=None): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(dtype=mindspore.float32) + + if gate is not None: + hidden_states = hidden_states * nn.functional.silu(gate.to(dtype=mindspore.float32)) + variance = hidden_states.pow(2).mean(-1, keep_dims=True) + hidden_states = hidden_states * ops.rsqrt(variance + self.variance_epsilon) + + return self.weight * hidden_states.to(input_dtype) + + +class Mamba2Mixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. 
+ A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + """ + + def __init__(self, config: Mamba2Config, layer_idx: int): + super().__init__() + self.num_heads = config.num_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.state_size + self.conv_kernel_size = config.conv_kernel + self.intermediate_size = int(config.expand * self.hidden_size) + self.time_step_rank = int(config.time_step_rank) + self.layer_idx = layer_idx + self.use_conv_bias = config.use_conv_bias + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + + self.layer_norm_epsilon = config.layer_norm_epsilon + self.rms_norm = config.rms_norm + + self.n_groups = config.n_groups + self.head_dim = config.head_dim + self.chunk_size = config.chunk_size + + self.time_step_limit = config.time_step_limit + self.time_step_min = config.time_step_min + self.time_step_max = config.time_step_max + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.use_conv_bias, + kernel_size=config.conv_kernel, + groups=self.conv_dim, + padding=config.conv_kernel - 1, + ) + + # projection of the input hidden states + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear( + self.hidden_size, + projection_size, + bias=config.use_bias, + ) + # selective projection used to make dt, B and C input dependant + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter(ops.ones(self.num_heads)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = ops.arange(1, self.num_heads + 1).astype(mindspore.float32) + self.A_log = nn.Parameter(ops.log(A)) + self.A_log._no_weight_decay = True + self.norm = MambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon) + self.D = nn.Parameter(ops.ones(self.num_heads)) + self.D._no_weight_decay = True + + # use_bias (`bool`, *optional*, defaults to `False`) + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) + self.use_bias = config.use_bias + + # fmt: off + def mindspore_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None, cache_position:Optional[mindspore.Tensor]=None, attention_mask: Optional[mindspore.Tensor]=None): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + + # 1. Gated MLP's linear projection + input_states = apply_mask_to_padding_states(input_states, attention_mask) + projected_states = self.in_proj(input_states) + d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size-self.num_heads) // 2 + _, _, gate, hidden_states_B_C, dt = ops.split( + projected_states, [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. 
Convolution sequence transformation + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False) + + conv_states = cache_params.conv_states[self.layer_idx] + + hidden_states_B_C = ops.sum( + conv_states * self.conv1d.weight.squeeze(1), dim=-1 + ) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.swapaxes(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) + + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.swapaxes(1, 2))[..., :seq_len].swapaxes(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = ops.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # 3. SSM transformation + A = -ops.exp(self.A_log.float()) # [num_heads] + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + # Delete 'device' in mindspore + cache_device = cache_params.ssm_states + + # Note: there is no need to pad parameter matrices here, as there is just one new token + # for batched generation + dt = dt[:, 0, :][:, None, ...] + dt = dt.swapaxes(1, 2).broadcast_to((batch_size, dt.shape[-1], self.head_dim)) + # [num_heads] -> [num_heads, head_dim] + dt_bias = self.dt_bias[..., None].broadcast_to((self.dt_bias.shape[0], self.head_dim)) + + dt = nn.functional.softplus(dt + dt_bias.to(dt.dtype)) + dt = ops.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + A = A[..., None, None].broadcast_to((self.num_heads, self.head_dim, self.ssm_state_size)).to(dtype=mindspore.float32) + # [bsz, num_heads, head_dim, state_size] + dA = (ops.exp(dt[..., None] * A)) + + # Discretize B + # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> + # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size] + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.broadcast_to((batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1])).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + # [bsz, num_heads, head_dim, state_size] + dB = dt[..., None] * B[..., None, :] + + # Discretize x into dB + # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]) + + # State calculation + cache_params.update_ssm_state( + layer_idx=self.layer_idx, + new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + # Subsequent output + # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size] + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.broadcast_to((batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1])) + C = C.reshape(batch_size, -1, C.shape[-1]) + # [bsz, num_heads, head_dim] + + ssm_states = cache_params.ssm_states[self.layer_idx].to(dtype=C.dtype) # Shape: [b, h, d, n] + + # Reshape ssm_states to merge the first two dimensions + ssm_states_reshaped = 
ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) # Shape: [b*h, d, n] + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] + y = ops.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + # D skip connection + # [num_heads] -> [num_heads, head_dim] + D = self.D[..., None].broadcast_to((self.D.shape[0], self.head_dim)) + y = (y + hidden_states * D).to(y.dtype) + + # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size] + y = y.reshape(batch_size, -1)[:, None, ...] + else: + # begin ssd naive implementation without einsums + dt = nn.functional.softplus(dt + self.dt_bias) + dt = ops.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.tile((1, 1, self.num_heads // self.n_groups, 1)) + C = C.tile((1, 1, self.num_heads // self.n_groups, 1)) + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + # Discretize x and A + hidden_states = hidden_states * dt[..., None] + A = A.to(hidden_states.dtype) * dt + + # Rearrange into blocks/chunks + hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] + + # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] + A = A.permute(0, 3, 1, 2) + A_cumsum = ops.cumsum(A, dim=-1) + + # 1. Compute the output for each intra-chunk (diagonal blocks) + # This is the analog of a causal mask + L = ops.exp(segment_sum(A)) + + # Contraction of C and B to get G (attention-weights like) + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # shape: (b, c, l, s, h, n) + G = G_intermediate.sum(axis=-1) # shape: (b, c, l, s, h) + + # Compute M, equivalent to applying attention mask to weights + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(axis=-1) + + # Compute Y_diag (apply to values) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(axis=3) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + decay_states = ops.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(axis=2) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...] + else: + previous_states = ops.zeros_like(states[:, :1]) + states = ops.cat([previous_states, states], dim=1) + decay_chunk = ops.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.swapaxes(1, 3) + new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(axis=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + # 4. 
Compute state -> output conversion per chunk + # (left term of low-rank factorization of off-diagonal blocks; C terms) + state_decay_out = ops.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(axis=-1) * state_decay_out_permuted[..., None]) + + # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) + y = Y_diag + Y_off + # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + + y = y + D_residual + # Cutting off padded chunks + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) + + scan_output = self.norm(y, gate) + + # end ssd naive + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.to(dtype)) # [batch, seq_len, hidden_size] + return contextualized_states + + # fmt: on + def forward( + self, + hidden_states, + cache_params: Optional[Mamba2Cache] = None, + cache_position: Optional[mindspore.Tensor] = None, + attention_mask: Optional[mindspore.Tensor] = None, + ): + dtype = hidden_states.dtype + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return self.mindspore_forward(hidden_states, cache_params, cache_position, attention_mask) + + +class Mamba2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Mamba2RMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm + """ + super().__init__() + self.weight = nn.Parameter(ops.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(mindspore.float32) + variance = ops.mean(hidden_states.pow(2), -1, keepdim=True) + hidden_states = hidden_states * ops.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class Mamba2Block(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.residual_in_fp32 = config.residual_in_fp32 + self.norm = Mamba2RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.mixer = Mamba2Mixer(config, layer_idx=layer_idx) + + def forward( + self, + hidden_states, + cache_params: Optional[Mamba2Cache] = None, + cache_position: Optional[mindspore.Tensor] = None, + attention_mask: Optional[mindspore.Tensor] = None, + ): + residual = hidden_states + hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(dtype=mindspore.float32) + + hidden_states = self.mixer( + hidden_states, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask + ) + hidden_states = residual + hidden_states + return hidden_states + + +class Mamba2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = Mamba2Config + base_model_prefix = "backbone" + _no_split_modules = ["Mamba2Block"] + supports_gradient_checkpointing = True + _is_stateful = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, Mamba2Mixer): + module.A_log._no_weight_decay = True + module.D._no_weight_decay = True + + dt = ops.exp( + ops.rand(self.config.num_heads) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + ops.log(-ops.expm1(-dt)) + with no_grad(): + module.dt_bias.assign_value(inv_dt) + module.dt_bias._no_reinit = True + + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=self.config.initializer_range) + + if self.config.rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with no_grad(): + p /= math.sqrt(self.config.num_hidden_layers) + + +@dataclass +# Copied from transformers.models.mamba.modeling_mamba.MambaOutput with MAMBA->MAMBA2,Mamba->Mamba2 +class Mamba2Output(ModelOutput): + """ + Class for the MAMBA2 model outputs. + + Args: + last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + cache_params (`Mamba2Cache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. + + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[mindspore.Tensor] = None + cache_params: Optional[Mamba2Cache] = None + hidden_states: Optional[Tuple[mindspore.Tensor]] = None + + +@dataclass +# Copied from transformers.models.mamba.modeling_mamba.MambaCausalLMOutput with Mamba->Mamba2 +class Mamba2CausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. 
+ + Args: + loss (`mindspore.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + cache_params (`Mamba2Cache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. + + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + loss: Optional[mindspore.Tensor] = None + logits: Optional[mindspore.Tensor] = None + cache_params: Optional[Mamba2Cache] = None + hidden_states: Optional[Tuple[mindspore.Tensor]] = None + +class Mamba2Model(Mamba2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList([Mamba2Block(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) + + self.gradient_checkpointing = False + self.norm_f = Mamba2RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + # Initialize weights and apply final processing + self._register_load_state_dict_pre_hook(self.load_hook) + self.post_init() + + def load_hook(self, state_dict, prefix, *args): + for k in state_dict: + if "embedding." 
in k: + state_dict[k.replace("embedding.", "embeddings.")] = state_dict.pop(k) + break + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + def forward( + self, + input_ids: Optional[mindspore.Tensor] = None, + inputs_embeds: Optional[mindspore.Tensor] = None, + cache_params: Optional[Mamba2Cache] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[mindspore.Tensor] = None, + attention_mask: Optional[mindspore.Tensor] = None, + **kwargs, + ) -> Union[Tuple, Mamba2Output]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + if self.gradient_checkpointing and self.training and use_cache: + use_cache = False + + if use_cache: + if cache_params is None: + cache_params = Mamba2Cache( + self.config, inputs_embeds.shape[0], dtype=inputs_embeds.dtype + ) + cache_position = ops.arange(0, self.config.conv_kernel, dtype=mindspore.int64) + elif cache_position is None: + # cases when we do manual forward instead of using `model.generate` which will initiate + # `cache_position` and makes sure it is not None, throw error here instead of doing some + # hack to conjecture the current cache position + raise ValueError( + "You have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, " + "you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will " + "be initialized for you automatically" + ) + else: + cache_params = None + + hidden_states = inputs_embeds + all_hidden_states = () if output_hidden_states else None + for mixer_block in self.layers: + if self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + mixer_block.__call__, hidden_states, cache_params, cache_position, attention_mask + ) + else: + hidden_states = mixer_block( + hidden_states, + cache_params=cache_params, + cache_position=cache_position, + attention_mask=attention_mask, + ) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states = self.norm_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) + + return Mamba2Output( + last_hidden_state=hidden_states, + cache_params=cache_params if use_cache else None, + hidden_states=all_hidden_states, + ) + +class Mamba2ForCausalLM(Mamba2PreTrainedModel, GenerationMixin): + _tied_weights_keys = [] + + def __init__(self, config): + super().__init__(config) + self.backbone = Mamba2Model(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head + + def 
set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def get_input_embeddings(self):
+        return self.backbone.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        return self.backbone.set_input_embeddings(new_embeddings)
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids=None,
+        inputs_embeds=None,
+        use_cache=None,
+        cache_params: Optional[Mamba2Cache] = None,
+        cache_position: Optional[mindspore.Tensor] = None,
+        attention_mask: Optional[mindspore.Tensor] = None,
+        **kwargs,
+    ):
+        # Overwritten -- uses `cache_params` as opposed to `past_key_values`
+
+        if use_cache:
+            # `cache_position` should have been initialized in `generate`
+            if cache_position is None:
+                raise ValueError(
+                    "`cache_position` should not be None as it should have been initialized in "
+                    "`model.generate`, you are responsible for passing in a valid `cache_position` if "
+                    "you are calling `prepare_inputs_for_generation` directly with `use_cache=True`"
+                )
+            if cache_position[0] > 0:
+                input_ids = input_ids[:, -1][..., None]
+
+                if attention_mask is not None:
+                    attention_mask = None
+            else:
+                # we initialize the `cache_position` to full size of `conv_states` at prefill stage
+                # considering padding will be applied when input length is shorter, and truncation
+                # will be applied when it is longer, so it will be equivalent to always have it match
+                # the length of `cache_params.conv_states`, which is `config.conv_kernel`
+                cache_position = ops.arange(0, self.config.conv_kernel, dtype=mindspore.int64)
+
+        if inputs_embeds is not None and cache_params is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "attention_mask": attention_mask,
+                "cache_params": cache_params,
+                "use_cache": use_cache,
+                "cache_position": cache_position,
+            }
+        )
+        return model_inputs
+
+    def forward(
+        self,
+        input_ids: Optional[mindspore.Tensor] = None,
+        inputs_embeds: Optional[mindspore.Tensor] = None,
+        cache_params: Optional[Mamba2Cache] = None,
+        labels: Optional[mindspore.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[mindspore.Tensor] = None,
+        attention_mask: Optional[mindspore.Tensor] = None,
+        **kwargs,  # for now we need this for generation
+    ) -> Union[Tuple, Mamba2CausalLMOutput]:
+        r"""
+        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
+            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        mamba2_outputs = self.backbone(
+            input_ids,
+            cache_params=cache_params,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+        )
+        hidden_states = mamba2_outputs[0]
+
+        logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + mamba2_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return Mamba2CausalLMOutput(
+            loss=loss,
+            logits=logits,
+            cache_params=mamba2_outputs.cache_params,
+            hidden_states=mamba2_outputs.hidden_states,
+        )
+
+
+__all__ = ["Mamba2ForCausalLM", "Mamba2Model", "Mamba2PreTrainedModel"]
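For reference, a minimal usage sketch for the new `Mamba2ForCausalLM` class, distilled from the integration tests added later in this patch. The checkpoint id, `from_pt=True`, `mindspore_dtype`, and tokenizer arguments are taken from those tests; access to the gated checkpoint is assumed.

    from mindnlp.transformers import AutoTokenizer, Mamba2ForCausalLM
    import mindspore

    model_id = "mistralai/Mamba-Codestral-7B-v0.1"  # gated checkpoint used by the integration tests
    tokenizer = AutoTokenizer.from_pretrained(model_id, from_slow=True, legacy=False, from_pt=True)
    model = Mamba2ForCausalLM.from_pretrained(model_id, mindspore_dtype=mindspore.bfloat16, from_pt=True)

    # tokenize a prompt and greedily decode a short continuation (mirrors test_simple_generate)
    input_ids = tokenizer("[INST]Write a hello world program in C++.[/INST]", return_tensors="pt")["input_ids"]
    out = model.generate(input_ids, do_sample=False, use_cache=True, max_new_tokens=30)
    print(tokenizer.decode(out[0]))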
diff --git a/mindnlp/utils/import_utils.py b/mindnlp/utils/import_utils.py
index 547f2efd9..8a902abba 100644
--- a/mindnlp/utils/import_utils.py
+++ b/mindnlp/utils/import_utils.py
@@ -382,6 +382,11 @@ def is_essentia_available():
     """
     return _essentia_available
 
+def is_mamba_2_ssm_available():
+    return _is_package_available("mamba_ssm")
+
+def is_causal_conv1d_available():
+    return _is_package_available("causal_conv1d")
 
 def is_pyctcdecode_available():
     """
diff --git a/mindnlp/utils/testing_utils.py b/mindnlp/utils/testing_utils.py
index 13d4b4f1d..466c4e4c3 100644
--- a/mindnlp/utils/testing_utils.py
+++ b/mindnlp/utils/testing_utils.py
@@ -262,6 +262,78 @@ def require_librosa(test_case):
     """
     return unittest.skipUnless(is_librosa_available(), "test requires librosa")(test_case)
 
+################################################################################
+### update_wrapper() and wraps() decorator
+################################################################################
+
+# update_wrapper() and wraps() are tools to help write
+# wrapper functions that can handle naive introspection
+# Note from mamba2 model porting: the original mamba2 code requires Python 3.13+,
+# so we copy the code from Python 3.13+
+WRAPPER_ASSIGNMENTS = ('__module__', '__name__', '__qualname__', '__doc__',
+                       '__annotations__', '__type_params__')
+WRAPPER_UPDATES = ('__dict__',)
+def update_wrapper(wrapper,
+                   wrapped,
+                   assigned = WRAPPER_ASSIGNMENTS,
+                   updated = WRAPPER_UPDATES):
+    """Update a wrapper function to look like the wrapped function
+
+       wrapper is the function to be updated
+       wrapped is the original function
+       assigned is a tuple naming the attributes assigned directly
+       from the wrapped function to the wrapper function (defaults to
+       functools.WRAPPER_ASSIGNMENTS)
+       updated is a tuple naming the attributes of the wrapper that
+       are updated with the corresponding attribute from the wrapped
+       function (defaults to functools.WRAPPER_UPDATES)
+    """
+    for attr in assigned:
+        try:
+            value = getattr(wrapped, attr)
+        except AttributeError:
+            pass
+        else:
+            setattr(wrapper, attr, value)
+    for attr in updated:
+        getattr(wrapper, attr).update(getattr(wrapped, attr, {}))
+    # Issue
#17482: set __wrapped__ last so we don't inadvertently copy it + # from the wrapped function when updating __dict__ + wrapper.__wrapped__ = wrapped + # Return the wrapper so this can be used as a decorator via partial() + return wrapper + +def wraps(wrapped, + assigned = WRAPPER_ASSIGNMENTS, + updated = WRAPPER_UPDATES): + """Decorator factory to apply update_wrapper() to a wrapper function + + Returns a decorator that invokes update_wrapper() with the decorated + function as the wrapper argument and the arguments to wraps() as the + remaining arguments. Default arguments are as for update_wrapper(). + This is a convenience function to simplify applying partial() to + update_wrapper(). + """ + return functools.partial(update_wrapper, wrapped=wrapped, + assigned=assigned, updated=updated) + +def require_read_token(fn): + """ + A decorator that loads the HF token for tests that require to load gated models. + """ + token = os.getenv("HF_HUB_READ_TOKEN") + + @wraps(fn) + def _inner(*args, **kwargs): + if token is not None: + with patch("huggingface_hub.utils._headers.get_token", return_value=token): + return fn(*args, **kwargs) + else: # Allow running locally with the default token env variable + return fn(*args, **kwargs) + + return _inner + + def require_essentia(test_case): """ Decorator marking a test that requires essentia diff --git a/tests/transformers/generation/test_utils.py b/tests/transformers/generation/test_utils.py index 35b486d6a..74ed50116 100644 --- a/tests/transformers/generation/test_utils.py +++ b/tests/transformers/generation/test_utils.py @@ -1630,16 +1630,16 @@ def test_generate_from_inputs_embeds_decoder_only(self): # Traditional way of generating text outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True + input_ids, max_new_tokens=1, return_dict_in_generate=True, output_scores=True ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) + self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 1)) # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) inputs_embeds = model.get_input_embeddings()(input_ids) outputs_from_embeds = model.generate( input_ids, inputs_embeds=inputs_embeds, - max_new_tokens=5, + max_new_tokens=1, return_dict_in_generate=True, output_scores=True, ) @@ -1651,7 +1651,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): outputs_from_rand_embeds = model.generate( input_ids, inputs_embeds=random_embeds, - max_new_tokens=5, + max_new_tokens=1, return_dict_in_generate=True, output_scores=True, ) @@ -1660,7 +1660,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=5, return_dict_in_generate=True, output_scores=True + inputs_embeds=inputs_embeds, max_new_tokens=1, return_dict_in_generate=True, output_scores=True ) self.assertListEqual( outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), diff --git a/tests/transformers/models/mamba2/__init__.py b/tests/transformers/models/mamba2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/transformers/models/mamba2/test_modeling_mamba2.py b/tests/transformers/models/mamba2/test_modeling_mamba2.py new file mode 100644 index 000000000..1a684efa4 --- /dev/null +++ 
b/tests/transformers/models/mamba2/test_modeling_mamba2.py @@ -0,0 +1,404 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest +from typing import Dict, List, Tuple + + +from mindnlp.transformers import AutoTokenizer, Mamba2Config, is_mindspore_available +from mindnlp.utils.testing_utils import require_read_token, slow, require_mindspore + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_mindspore_available(): + import mindspore + from mindnlp.core import ops, nn, no_grad + + from mindnlp.transformers import ( + Mamba2ForCausalLM, + Mamba2Model, + ) + from mindnlp.transformers.models.mamba2.modeling_mamba2 import Mamba2Cache, Mamba2Mixer + + +class Mamba2ModelTester: + def __init__( + self, + parent, + batch_size=14, + num_heads=8, + n_groups=8, + state_size=2, + head_dim=8, + conv_kernel=4, + chunk_size=8, + seq_length=7, + is_training=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + hidden_act="silu", + hidden_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + num_labels=3, + num_choices=4, + scope=None, + tie_word_embeddings=False, + ): + self.parent = parent + self.num_heads = num_heads + self.n_groups = n_groups + self.head_dim = head_dim + self.state_size = state_size + self.conv_kernel = conv_kernel + self.chunk_size = chunk_size + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + self.tie_word_embeddings = tie_word_embeddings + + def get_large_model_config(self): + return Mamba2Config.from_pretrained("mistralai/Mamba-Codestral-7B-v0.1", from_pt=True) + + def prepare_config_and_inputs( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + # Only left padding is valid + attention_mask = ops.ones((self.batch_size, self.seq_length), mindspore.int64) + attention_mask[0, :1] = 0 + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], 
self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config( + gradient_checkpointing=gradient_checkpointing, + ) + + return ( + config, + input_ids, + attention_mask, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config(self, gradient_checkpointing=False): + return Mamba2Config( + head_dim=self.head_dim, + num_heads=self.num_heads, + n_groups=self.n_groups, + state_size=self.state_size, + conv_kernel=self.conv_kernel, + chunk_size=self.chunk_size, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + activation_function=self.hidden_act, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + tie_word_embeddings=self.tie_word_embeddings, + ) + + def prepare_config_and_inputs_for_common(self): + ( + config, + input_ids, + _, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + inputs_dict = {"input_ids": input_ids} + return config, inputs_dict + + def create_and_check_mamba2_caching(self, config, input_ids, attention_mask, *args): + model = Mamba2Model(config=config) + model.eval() + + output_whole = model(input_ids, attention_mask=attention_mask).last_hidden_state + + outputs = model( + input_ids[:, :-1], + attention_mask=attention_mask[:, :-1], + use_cache=True, + cache_position=ops.arange(0, config.conv_kernel), + ) + output_one = outputs.last_hidden_state + + # Using the state computed on the first inputs, we will get the same output + outputs = model( + input_ids[:, -1:], + attention_mask=attention_mask[:, -1:], + use_cache=True, + cache_params=outputs.cache_params, + cache_position=ops.arange(config.conv_kernel, config.conv_kernel + 1), + ) + output_two = outputs.last_hidden_state + + self.parent.assertTrue( + ops.allclose(ops.cat([output_one, output_two], dim=1), output_whole, atol=1e-3, rtol=1e-3) + ) + +@require_mindspore +class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (Mamba2Model, Mamba2ForCausalLM) if is_mindspore_available() else () + all_generative_model_classes = (Mamba2ForCausalLM,) if is_mindspore_available() else () + has_attentions = False # Mamba does not support attentions + fx_compatible = False # FIXME let's try to support this @molbap + test_missing_keys = False + test_model_parallel = False + test_pruning = False + test_head_masking = False # Mamba does not have attention heads + + pipeline_model_mapping = ( + {"feature-extraction": Mamba2Model, "text-generation": Mamba2ForCausalLM} if is_mindspore_available() else {} + ) + + def setUp(self): + self.model_tester = Mamba2ModelTester(self) + self.config_tester = ConfigTester( + self, config_class=Mamba2Config, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"] + ) + + @unittest.skip(reason="Skipped in mamba") + def test_mamba2_caching(self): + pass + # config_and_inputs = self.model_tester.prepare_config_and_inputs() + # self.model_tester.create_and_check_mamba2_caching(*config_and_inputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config=config) + for name, param in model.named_parameters(): + if "D" in name: + if param.requires_grad: + # check if it's a 
ones like + assert ops.allclose(param.data, ops.ones_like(param.data), rtol=1e-5, atol=1e-5) + + @unittest.skip(reason="Mamba 2 weights are not tied") + def test_tied_weights_keys(self): + pass + + @unittest.skip(reason="A large mamba2 would be necessary (and costly) for that") + def test_multi_gpu_data_parallel_forward(self): + pass + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, Mamba2Cache): # MODIFIED PART START + recursive_check(tuple_object.conv_states, dict_object.conv_states) + recursive_check(tuple_object.ssm_states, dict_object.ssm_states) + elif isinstance(tuple_object, (List, Tuple)): # MODIFIED PART END + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + ops.allclose(tuple_object, dict_object, atol=1e-5), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" + f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + +@require_mindspore +@slow +@require_read_token +class Mamba2IntegrationTest(unittest.TestCase): + def setUp(self): + self.model_id = "mistralai/Mamba-Codestral-7B-v0.1" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, from_slow=True, legacy=False, from_pt=True) + self.prompt = ("[INST]Write a hello world program in C++.",) + + @require_read_token + @slow + @require_mindspore + def test_simple_generate(self): + """ + Simple generate test to avoid regressions. + Note: state-spaces (cuda) implementation and pure torch implementation + have irreconciliable differences as of now, which will cause this test to fail + in an environment with state-spaces installed. 
+        """
+        tokenizer = self.tokenizer
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+
+        model = Mamba2ForCausalLM.from_pretrained(self.model_id, mindspore_dtype=mindspore.bfloat16, from_pt=True)
+        input_ids = tokenizer("[INST]Write a hello world program in C++.[/INST]", return_tensors="pt")["input_ids"]
+
+        out = model.generate(input_ids, do_sample=False, use_cache=True, max_new_tokens=30)
+        output_sentence = tokenizer.decode(out[0])
+        ground_truth_sentence = """[INST]Write a hello world program in C++.[/INST] Sure, here is a simple "Hello, World!" program in C++:\n\n```cpp\n#include <iostream>\n\n"""
+        assert output_sentence == ground_truth_sentence
+
+    @require_read_token
+    @slow
+    @require_mindspore
+    def test_batched_equivalence_with_cache(self):
+        """
+        Verifies that batched generation matches individual generation.
+        Important because of the specific caching mechanism + statefulness of mamba model.
+        Depending on precision and devices, differences can be observed from generation to generation.
+        """
+        tokenizer = self.tokenizer
+        prompt = [
+            "[INST]Write C#.[/INST]",
+            "[INST]Write a hello world in C++.[/INST]",
+            "[INST] Write a simple Fibonacci number computation function in Rust that does memoization, with comments, in safe Rust.[/INST]",
+        ]
+
+        model = Mamba2ForCausalLM.from_pretrained(self.model_id, mindspore_dtype=mindspore.bfloat16, from_pt=True)
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        # batched generation
+        tokenized_prompts = tokenizer(prompt, return_tensors="pt", padding="longest")
+        batched_gen = model.generate(**tokenized_prompts, max_new_tokens=30, use_cache=True)
+        batched_output = tokenizer.batch_decode(batched_gen, skip_special_tokens=True)
+
+        # individual generation
+
+        for index_gen, individual_prompt in enumerate(prompt):
+            inputs = tokenizer(individual_prompt, return_tensors="pt", padding="longest")
+            individual_gen = model.generate(**inputs, max_new_tokens=30, use_cache=True)
+            individual_output = tokenizer.batch_decode(individual_gen, skip_special_tokens=True)[0]
+            assert individual_output[:100] == batched_output[index_gen][:100]
+
+    @require_read_token
+    @slow
+    def test_batched_equivalence_without_cache(self):
+        """
+        Verifies that batched generation matches individual generation without cache.
+        Important because of the specific caching mechanism + statefulness of mamba model.
+        Depending on precision and devices, differences can be observed from generation to generation.
+ """ + tokenizer = self.tokenizer + prompt = [ + "[INST]Write C#.[/INST]", + "[INST]Write a hello world in C++.[/INST]", + "[INST] Write a simple Fibonacci number computation function in Rust that does memoization, with comments, in safe Rust.[/INST]", + ] + + model = Mamba2ForCausalLM.from_pretrained(self.model_id, mindspore_dtype=mindspore.bfloat16, from_pt=True) + tokenizer.pad_token_id = tokenizer.eos_token_id + # batched generation + tokenized_prompts = tokenizer(prompt, return_tensors="pt", padding="longest") + batched_gen = model.generate(**tokenized_prompts, max_new_tokens=30, use_cache=True) + batched_output = tokenizer.batch_decode(batched_gen, skip_special_tokens=True) + + # individual generation + + for index_gen, individual_prompt in enumerate(prompt): + inputs = tokenizer(individual_prompt, return_tensors="pt", padding="longest") + individual_gen = model.generate(**inputs, max_new_tokens=30, use_cache=True) + individual_output = tokenizer.batch_decode(individual_gen, skip_special_tokens=True)[0] + assert individual_output[:100] == batched_output[index_gen][:100] + + @slow + @require_mindspore + def test_mamba2_mixer_train_vs_eval_equivalence(self): + # Based on https://github.com/sustcsonglin/flash-linear-attention/issues/63 + # Credit to zhixuan-lin + + B, T, D = 4, 512, 768 + dtype = mindspore.bfloat16 + config = Mamba2Config(num_heads=24, head_dim=64, hidden_size=768, expand=2, n_groups=1) + + mindspore.set_seed(42) + with mindspore.amp.autocast(dtype=dtype): + with no_grad(): + mixer = Mamba2Mixer(config, layer_idx=0) + hidden_states = ops.rand(size=(B, T, D), dtype=dtype) + + mixer.train() + out_train = mixer(hidden_states) + + mixer.eval() + out_eval = mixer(hidden_states) + + assert ops.allclose(out_train, out_eval, rtol=1e-3, atol=1e-3) \ No newline at end of file From b28dff56bd74c0f94af8242c60850eb1633d6542 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 3 Apr 2025 12:29:10 +0800 Subject: [PATCH 12/12] pylint --- .../transformers/models/glpn/image_processing_glpn.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mindnlp/transformers/models/glpn/image_processing_glpn.py b/mindnlp/transformers/models/glpn/image_processing_glpn.py index 4ddedc18c..fc994be71 100644 --- a/mindnlp/transformers/models/glpn/image_processing_glpn.py +++ b/mindnlp/transformers/models/glpn/image_processing_glpn.py @@ -13,13 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Image processor class for GLPN.""" - -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union - - -if TYPE_CHECKING: - from ...modeling_outputs import DepthEstimatorOutput - import numpy as np import PIL.Image import mindspore @@ -38,6 +31,9 @@ validate_preprocess_arguments, ) from ....utils import TensorType, logging, requires_backends +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput