Commit c0bd08a

remove meta Tensor warning when dispatch meta operator (#2246)

1 parent d684196 commit c0bd08a
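
The meta backend previously inferred output shapes by calling input.numpy() on meta tensors, which materialized placeholder data and emitted a meta Tensor warning on every dispatch. The meta operators now derive shapes from zero-filled numpy placeholders (and gain setitem, meshgrid, and permute). The commit also bundles related fixes: a sliding-window cache update patched into transformers, reordered mask composition in masking_utils, and dtype/scalar handling in the cpu and npu backends.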

File tree

16 files changed: +107 −29 lines changed

examples/transformers/inference/deepseek-ocr/run_dpsk_ocr.py (1 addition, 1 deletion)

@@ -2,7 +2,7 @@
 import mindnlp
 from transformers import AutoModel, AutoTokenizer

-model_name = 'lvyufeng/DeepSeek-OCR-Community-Latest'
+model_name = 'lvyufeng/DeepSeek-OCR'


 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

mindnlp/transformers/__init__.py (2 additions, 1 deletion)

@@ -10,7 +10,7 @@
 from .masking_utils import create_causal_mask, create_sliding_window_causal_mask, create_masks_for_generate
 from .modeling_utils import construct_pipeline_parallel_model, _load_pretrained_model_wrapper, \
     _get_resolved_checkpoint_files_wrapper
-from .cache_utils import dynamic_layer_update
+from .cache_utils import dynamic_layer_update, dynamic_sliding_window_layer_update
 from .tokenization_utils import apply_chat_template_wrapper
 from .trainer import training_step
 from ..utils.decorators import dtype_wrapper, patch_dtype_wrapper, patch_wrappers
@@ -70,5 +70,6 @@ def empty_fn(*args, **kwargs):
 transformers.trainer.Trainer.training_step = training_step

 transformers.cache_utils.DynamicLayer.update = dynamic_layer_update
+transformers.cache_utils.DynamicSlidingWindowLayer.update = dynamic_sliding_window_layer_update
 # add mindnlp.transformers modules/attrs to lazymodule
 # setattr(sys.modules[__name__], 'test_ms_model', test_ms_model)
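
The patch reroutes transformers' cache layers by rebinding their update methods at class level. A minimal, self-contained sketch of that pattern (Layer and patched_update are hypothetical stand-ins, not mindnlp or transformers APIs):

# Minimal sketch of class-level method rebinding, the pattern used above to
# install dynamic_sliding_window_layer_update.
class Layer:
    def update(self, x):
        return x + 1

def patched_update(self, x):
    # A plain function assigned to the class becomes a bound method on access.
    return x * 2

Layer.update = patched_update
assert Layer().update(3) == 6  # instances now dispatch to the replacement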

mindnlp/transformers/cache_utils.py (36 additions, 0 deletions)

@@ -27,3 +27,39 @@ def dynamic_layer_update(
     self.keys = mindtorch.cat([self.keys, key_states], dim=-2)
     self.values = mindtorch.cat([self.values, value_states], dim=-2)
     return self.keys, self.values
+
+
+def dynamic_sliding_window_layer_update(
+    self,
+    key_states: mindtorch.Tensor,
+    value_states: mindtorch.Tensor,
+    cache_kwargs: Optional[dict[str, Any]] = None,
+) -> tuple[mindtorch.Tensor, mindtorch.Tensor]:
+    """
+    Update the key and value caches in-place, and return the necessary keys and value states.
+
+    Args:
+        key_states (`torch.Tensor`): The new key states to cache.
+        value_states (`torch.Tensor`): The new value states to cache.
+        cache_kwargs (`dict[str, Any]`, *optional*): Additional arguments for the cache.
+
+    Returns:
+        tuple[`torch.Tensor`, `torch.Tensor`]: The key and value states.
+    """
+    # Lazy initialization
+    if not self.is_initialized:
+        self.lazy_initialization(key_states)
+        full_key_states = key_states
+        full_value_states = value_states
+    else:
+        # Compute the full states
+        full_key_states = mindtorch.cat([self.keys, key_states], dim=-2)
+        full_value_states = mindtorch.cat([self.values, value_states], dim=-2)
+
+    self.cumulative_length += key_states.shape[-2]
+
+    # Only cache the last `self.sliding_window - 1` tokens (or all of them if lower than that)
+    self.keys = full_key_states[:, :, -self.sliding_window + 1 :, :]
+    self.values = full_value_states[:, :, -self.sliding_window + 1 :, :]
+
+    # Return the full states
+    return full_key_states, full_value_states
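
The update keeps at most sliding_window - 1 past tokens in the cache while still returning the full concatenated states for the current attention step. A plain-numpy sketch of that windowing arithmetic (illustrative only, not the mindtorch implementation):

import numpy as np

# The cache retains at most `sliding_window - 1` past tokens, while the full
# concatenated states are what update() returns for the current step.
sliding_window = 4
cache = np.empty((1, 1, 0, 8))            # [batch, heads, seq_len, head_dim]
for step in range(6):
    new = np.random.rand(1, 1, 1, 8)      # one new token per step
    full = np.concatenate([cache, new], axis=-2)   # what update() returns
    cache = full[:, :, -sliding_window + 1:, :]    # what update() keeps
    assert cache.shape[-2] <= sliding_window - 1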

mindnlp/transformers/masking_utils.py (7 additions, 6 deletions)

@@ -463,6 +463,7 @@ def eager_mask(
     """
     # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf
     _ = kwargs.pop("allow_is_causal_skip", None)
+
     mask = sdpa_mask(
         batch_size=batch_size,
         cache_position=cache_position,
@@ -785,12 +786,6 @@ def create_sliding_window_causal_mask(
     # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
     allow_is_causal_skip = not past_key_values.is_compileable if past_key_values is not None else True

-    # If we detected packing format
-    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
-        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
-        allow_is_causal_skip = False
-
-    # Allow slight deviations from sliding causal mask
     if or_mask_function is not None:
         if not _is_torch_greater_or_equal_than_2_6:
             raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
@@ -802,6 +797,12 @@
         mask_factory_function = and_masks(mask_factory_function, and_mask_function)
         allow_is_causal_skip = False

+
+    # If we detected packing format
+    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
+        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
+        allow_is_causal_skip = False
+
     # We now create the mask
     causal_mask = mask_interface(
         batch_size=batch_size,
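
Both the moved packing block and the surrounding code extend mask_factory_function by and-composing predicate callables. A minimal sketch of that composition semantics (this and_masks helper is illustrative, not the transformers implementation):

# And-composition of mask predicates: the combined mask allows a position
# only if every component predicate allows it.
def and_masks(*fns):
    def combined(batch, head, q_idx, kv_idx):
        return all(fn(batch, head, q_idx, kv_idx) for fn in fns)
    return combined

causal = lambda b, h, q, kv: kv <= q          # attend only to the past
window = lambda b, h, q, kv: q - kv < 3       # hypothetical window of 3
mask_fn = and_masks(causal, window)
assert mask_fn(0, 0, 5, 4) and not mask_fn(0, 0, 5, 1)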

mindnlp/utils/safetensors_patch.py (0 additions, 1 deletion)

@@ -98,7 +98,6 @@ def get(self, slice=None):
         if not SUPPORT_BF16 and self.info["dtype"] == "BF16":
             array = array.astype(np.float16)
         tensor = mindtorch.from_numpy(array)
-        tensor._ptr = array.ctypes.data
         return tensor

     @property

mindtorch/_apis/cpu.py (3 additions, 1 deletion)

@@ -156,6 +156,8 @@ def pad_v3(input, new_pad, mode, value=None, contiguous=True):
 def cumsum(self, dim, dtype):
     if self.shape[dim] == 0:
         return mindspore.tensor([], dtype=self.dtype)
+    if self.dtype == mindspore.int64:
+        return cast(legacy.cum_sum(cast(self, mindspore.int32), dim, False, False), mindspore.int64)
     return legacy.cum_sum(self, dim, False, False)

 def reduce_any(input, axis, keepdims):
@@ -1228,7 +1230,7 @@ def search_sorted(sorted_sequence, values, sorter, dtype, right):
     return legacy.search_sorted(sorted_sequence, values, sorter, dtype, right)

 def scatter_nd_update(input, indices, updates):
-    return legacy.scatter_nd_update(input, indices, updates, True)
+    return legacy.scatter_nd_update(input, indices, cast(updates, input.dtype), True)

 def triu_indices(row, col, offset, dtype):
     return legacy.triu_indices(row, col, offset, dtype)
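
The new int64 branch works around a cumsum kernel limitation by narrowing to int32, computing, and casting back. A numpy sketch of the same cast-compute-cast round trip (illustrative; values outside the int32 range would overflow inside the narrowed step):

import numpy as np

# Cast-compute-cast round trip like the int64 cumsum branch above.
x = np.arange(5, dtype=np.int64)
out = np.cumsum(x.astype(np.int32)).astype(np.int64)
assert out.dtype == np.int64 and out[-1] == 10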

mindtorch/_apis/meta.py (23 additions, 4 deletions)

@@ -62,21 +62,21 @@ def inplace_normal(input, *args):
 __all__.append('inplace_normal')

 def getitem(input, slice):
-    out = input.numpy()[slice]
+    out = np.zeros(input.shape)[slice]
     out = Tensor_(init='none', shape=out.shape, dtype=input.dtype)
     return mindtorch.Tensor(out)

 __all__.append('getitem')

-def sub(input, other, alpha):
+def sub(input, other, alpha=1.0):
     if isinstance(input, mindtorch.Tensor):
         return input
     return other

 __all__.append('sub')

 def pad_v3(input, pad, mode, value):
-    out = np.pad(input.numpy(), pad, mode, constant_values=value)
+    out = np.pad(np.zeros(input.shape), pad, mode, constant_values=value)
     out = Tensor_(init='none', shape=out.shape, dtype=input.dtype)
     return mindtorch.Tensor(out)

@@ -94,7 +94,7 @@ def cast(input, dtype):
 __all__.append('cast')

 def index_select(input, dim, index):
-    out = np.take(input.numpy(), index.numpy(), dim)
+    out = np.take(np.zeros(input.shape), np.zeros(index.shape, dtype=np.int64), dim)
     out = Tensor_(init='none', shape=out.shape, dtype=input.dtype)
     return mindtorch.Tensor(out)

@@ -146,6 +146,9 @@ def tril(input, k):
 __all__.append('tril')

 def reshape(input, shape):
+    if -1 in shape:
+        out = np.zeros(input.shape).reshape(shape)
+        shape = out.shape
     out = Tensor_(init='none', shape=tuple(shape), dtype=input.dtype)
     return mindtorch.Tensor(out)

@@ -414,4 +417,20 @@ def pad(input, pad, mode='constant', value=None):
         raise ValueError('pad size must be 2, 4 or 6')

     out = Tensor_(init='none', shape=new_size, dtype=input.dtype)
+    return mindtorch.Tensor(out)
+
+def setitem(self, slice, value):
+    return self
+
+def meshgrid(args, lambd):
+    res = np.meshgrid(*args, indexing=lambd)
+    outs = ()
+    for r in res:
+        out = Tensor_(init='none', shape=r.shape, dtype=args[0].dtype)
+        out = mindtorch.Tensor(out)
+        outs += (out,)
+    return outs
+
+def permute(input, dims):
+    out = Tensor_(init='none', shape=dims, dtype=input.dtype)
     return mindtorch.Tensor(out)
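
The recurring pattern in this file: a meta operator only needs the output shape, so it runs the numpy op on a zero-filled placeholder of the input's shape instead of calling input.numpy(), which is what triggered the meta Tensor warning. A standalone sketch of the idea (meta_getitem_shape is a hypothetical helper for illustration):

import numpy as np

# Shape-only inference: run the op on a zero-filled placeholder of the input
# shape and keep just the resulting shape; real tensor data is never read.
def meta_getitem_shape(input_shape, idx):
    return np.zeros(input_shape)[idx].shape

assert meta_getitem_shape((4, 6), (slice(1, 3), 2)) == (2,)
assert meta_getitem_shape((4, 6), slice(None, 2)) == (2, 6)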

mindtorch/_apis/npu_310b.py (3 additions, 0 deletions)

@@ -1553,6 +1553,9 @@ def unique_dim(input, sorted, return_inverse, dim):
     return legacy.unique_dim(input, sorted, return_inverse, dim)

 def inplace_add(input, other, alpha):
+    if isinstance(other, numbers.Number):
+        other = mindspore.Tensor(other, dtype=input.dtype)
+
     if ENABLE_PYBOOST:
         return pyboost.inplace_add_ext_op(input, other, alpha)
     return legacy.inplace_add(input, other)
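
inplace_add now promotes Python scalars to tensors of the destination dtype before hitting the in-place kernel. A numpy sketch of that guard (numpy stands in for mindspore here; not the backend implementation):

import numbers
import numpy as np

# Wrap Python numbers as arrays of the destination dtype before the in-place
# op, so the buffer's dtype is preserved.
def inplace_add(buf, other, alpha=1):
    if isinstance(other, numbers.Number):
        other = np.asarray(other, dtype=buf.dtype)
    buf += alpha * other
    return buf

x = np.zeros(3, dtype=np.float16)
assert inplace_add(x, 1).dtype == np.float16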

mindtorch/_apis/npu_910a.py (15 additions, 1 deletion)

@@ -1124,7 +1124,7 @@ def sqrt(input):
     return legacy.sqrt(input)

 def masked_scatter(input, mask, value):
-    return legacy.masked_scatter(input, mask, value)
+    return legacy.masked_scatter(input, mask, cast(value, input.dtype))

 def neg(input):
     if ENABLE_PYBOOST:
@@ -1532,6 +1532,8 @@ def unique_dim(input, sorted, return_inverse, dim):
     return legacy.unique_dim(input, sorted, return_inverse, dim)

 def inplace_add(input, other, alpha):
+    if isinstance(other, numbers.Number):
+        other = mindspore.Tensor(other, dtype=input.dtype)
     if ENABLE_PYBOOST:
         return pyboost.inplace_add_ext_op(input, other, alpha)
     return legacy.inplace_add(input, other)
@@ -1788,6 +1790,11 @@ def log2(input):
     return legacy.log2(input)

 def bucketize(input, boundaries, right=False):
+    if isinstance(boundaries, mindtorch.Tensor):
+        boundaries = boundaries.tolist()
+
+    if not boundaries:
+        return zeros_like(input)
     epsilon_ = 0. if right else 1.e-6
     boundaries = [boundary + epsilon_ for boundary in boundaries]
     return legacy.bucketize(input, boundaries)
@@ -2095,13 +2102,20 @@ def _process_dim_in_multi_dim_index(prev_result, orig_tensor, index, dim, indexe
             result = _do_select(prev_result, dim, index.item(), dim_index, prev_shape)
             del prev_shape[dim]
             return result, dim, remain_indexes, prev_shape
+
         # process index with Tensor bool type
         result = expand_dims(prev_result, dim)
         index_for_bool = tensor_1d if index else empty_tensor_1d
         _record_tensor_index(index_for_bool, remain_indexes, dim)
         prev_shape.insert(dim, 1)
         dim += 1
         return result, dim, remain_indexes, prev_shape
+
+    if index.dtype == mindtorch.bool and prev_result.ndim == 1:
+        result = masked_select(prev_result, index)
+        dim += 1
+        return result, dim, remain_indexes, prev_shape
+
     _record_tensor_index(index, remain_indexes, dim)
     dim += 1
     return result, dim, remain_indexes, prev_shape
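
bucketize now tolerates tensor boundaries and empty boundary lists; the pre-existing epsilon shift keeps values equal to a boundary in the left bucket when right=False. A numpy sketch of the equivalent behavior (illustrative, using np.searchsorted in place of legacy.bucketize):

import numpy as np

# Tensor boundaries become a plain list, an empty list maps everything to
# bucket 0, and the epsilon emulates the right=False convention.
def bucketize(values, boundaries, right=False):
    boundaries = list(boundaries)
    if not boundaries:
        return np.zeros_like(values, dtype=np.int64)
    eps = 0.0 if right else 1e-6
    return np.searchsorted(np.asarray(boundaries) + eps, values, side='left')

assert list(bucketize(np.array([0.5, 1.0, 2.5]), [1.0, 2.0])) == [0, 0, 2]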
