Skip to content

Commit ebc4263

Browse files
committed
Revert regression in DimShuffle C-impl speed
Introduced in e593b0a due to a bug when inputs had zero strides. The bug can be fixed simply by removing a block that assumed some `full`/`broadcasting` behavior on the part of the operation — behavior that DimShuffle does not actually perform.
1 parent 003e268 commit ebc4263

File tree

3 files changed

+116
-74
lines changed

3 files changed

+116
-74
lines changed

pytensor/tensor/c_code/dimshuffle.c

+80-69
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,93 @@
11
#section support_code_apply
22

3-
int APPLY_SPECIFIC(cpu_dimshuffle)(PyArrayObject *input, PyArrayObject **res,
4-
PARAMS_TYPE *params) {
5-
6-
// This points to either the original input or a copy we create below.
7-
// Either way, this is what we should be working on/with.
8-
PyArrayObject *_input;
9-
10-
if (*res)
11-
Py_XDECREF(*res);
12-
13-
if (params->inplace) {
14-
_input = input;
15-
Py_INCREF((PyObject *)_input);
16-
} else {
17-
_input = (PyArrayObject *)PyArray_FromAny(
18-
(PyObject *)input, NULL, 0, 0, NPY_ARRAY_ALIGNED | NPY_ARRAY_ENSURECOPY,
19-
NULL);
20-
}
21-
22-
PyArray_Dims permute;
23-
24-
if (!PyArray_IntpConverter((PyObject *)params->transposition, &permute)) {
25-
return 1;
26-
}
27-
28-
/*
29-
res = res.transpose(self.transposition)
30-
*/
31-
PyArrayObject *transposed_input =
32-
(PyArrayObject *)PyArray_Transpose(_input, &permute);
33-
34-
Py_DECREF(_input);
35-
36-
PyDimMem_FREE(permute.ptr);
3+
int APPLY_SPECIFIC(cpu_dimshuffle)(PyArrayObject *input, PyArrayObject **res, PARAMS_TYPE *params) {
4+
npy_int64* new_order;
5+
npy_intp nd_in;
6+
npy_intp nd_out;
7+
npy_intp* dimensions;
8+
npy_intp* strides;
9+
10+
// This points to either the original input or a copy we create below.
11+
// Either way, this is what we should be working on/with.
12+
PyArrayObject *_input;
13+
14+
if (!PyArray_IS_C_CONTIGUOUS(params->_new_order)) {
15+
PyErr_SetString(PyExc_RuntimeError, "DimShuffle: param _new_order must be C-contiguous.");
16+
return 1;
17+
}
18+
new_order = (npy_int64*) PyArray_DATA(params->_new_order);
19+
nd_in = (npy_intp)(params->input_ndim);
20+
nd_out = PyArray_SIZE(params->_new_order);
3721

38-
npy_intp *res_shape = PyArray_DIMS(transposed_input);
39-
npy_intp N_shuffle = PyArray_SIZE(params->shuffle);
40-
npy_intp N_augment = PyArray_SIZE(params->augment);
41-
npy_intp N = N_augment + N_shuffle;
42-
npy_intp *_reshape_shape = PyDimMem_NEW(N);
22+
if (PyArray_NDIM(input) != nd_in) {
23+
PyErr_SetString(PyExc_NotImplementedError, "DimShuffle: Input has less dimensions than expected.");
24+
return 1;
25+
}
4326

44-
if (_reshape_shape == NULL) {
45-
PyErr_NoMemory();
46-
return 1;
47-
}
27+
if (*res)
28+
Py_XDECREF(*res);
4829

49-
/*
50-
shape = list(res.shape[: len(self.shuffle)])
51-
for augm in self.augment:
52-
shape.insert(augm, 1)
53-
*/
54-
npy_intp aug_idx = 0;
55-
int res_idx = 0;
56-
for (npy_intp i = 0; i < N; i++) {
57-
if (aug_idx < N_augment &&
58-
i == *((npy_intp *)PyArray_GetPtr(params->augment, &aug_idx))) {
59-
_reshape_shape[i] = 1;
60-
aug_idx++;
30+
if (params->inplace) {
31+
_input = input;
32+
Py_INCREF((PyObject*)_input);
6133
} else {
62-
_reshape_shape[i] = res_shape[res_idx];
63-
res_idx++;
34+
_input = (PyArrayObject *)PyArray_FromAny(
35+
(PyObject *)input, NULL, 0, 0, NPY_ARRAY_ALIGNED | NPY_ARRAY_ENSURECOPY,
36+
NULL);
6437
}
65-
}
6638

67-
PyArray_Dims reshape_shape = {.ptr = _reshape_shape, .len = (int)N};
39+
// Compute new dimensions and strides
40+
dimensions = (npy_intp*) malloc(nd_out * sizeof(npy_intp));
41+
strides = (npy_intp*) malloc(nd_out * sizeof(npy_intp));
42+
if (dimensions == NULL || strides == NULL) {
43+
PyErr_NoMemory();
44+
free(dimensions);
45+
free(strides);
46+
return 1;
47+
};
48+
49+
npy_intp original_size = PyArray_SIZE(_input);
50+
npy_intp new_size = 1;
51+
for (npy_intp i = 0; i < nd_out; ++i) {
52+
if (new_order[i] != -1) {
53+
dimensions[i] = PyArray_DIMS(_input)[new_order[i]];
54+
strides[i] = PyArray_DIMS(_input)[new_order[i]] == 1 ? 0 : PyArray_STRIDES(_input)[new_order[i]];
55+
} else {
56+
dimensions[i] = 1;
57+
strides[i] = 0;
58+
}
59+
new_size *= dimensions[i];
60+
}
6861

69-
/* res = res.reshape(shape) */
70-
*res = (PyArrayObject *)PyArray_Newshape(transposed_input, &reshape_shape,
71-
NPY_CORDER);
62+
if (original_size != new_size) {
63+
PyErr_SetString(PyExc_ValueError, "DimShuffle: Attempting to squeeze axes with size not equal to one.");
64+
free(dimensions);
65+
free(strides);
66+
return 1;
67+
}
7268

73-
Py_DECREF(transposed_input);
69+
// Create the new array.
70+
*res = (PyArrayObject*)PyArray_New(&PyArray_Type, nd_out, dimensions,
71+
PyArray_TYPE(_input), strides,
72+
PyArray_DATA(_input), PyArray_ITEMSIZE(_input),
73+
// borrow only the writable flag from the base
74+
// the NPY_OWNDATA flag will default to 0.
75+
(NPY_ARRAY_WRITEABLE * PyArray_ISWRITEABLE(_input)),
76+
NULL);
77+
78+
if (*res == NULL) {
79+
free(dimensions);
80+
free(strides);
81+
return 1;
82+
}
7483

75-
PyDimMem_FREE(reshape_shape.ptr);
84+
// recalculate flags: CONTIGUOUS, FORTRAN, ALIGNED
85+
PyArray_UpdateFlags(*res, NPY_ARRAY_UPDATE_ALL);
7686

77-
if (!*res) {
78-
return 1;
79-
}
87+
// we are making a view in both inplace and non-inplace cases
88+
PyArray_SetBaseObject(*res, (PyObject*)_input);
8089

81-
return 0;
82-
}
90+
free(strides);
91+
free(dimensions);
92+
return 0;
93+
}

pytensor/tensor/elemwise.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from pytensor.scalar import get_scalar_type
2222
from pytensor.scalar.basic import bool as scalar_bool
2323
from pytensor.scalar.basic import identity as scalar_identity
24-
from pytensor.scalar.basic import transfer_type, upcast
24+
from pytensor.scalar.basic import int64, transfer_type, upcast
2525
from pytensor.tensor import elemwise_cgen as cgen
2626
from pytensor.tensor import get_vector_length
2727
from pytensor.tensor.basic import _get_vector_length, as_tensor_variable
@@ -121,12 +121,18 @@ class DimShuffle(ExternalCOp):
121121
@property
122122
def params_type(self):
123123
return ParamsType(
124-
shuffle=lvector,
125-
augment=lvector,
126-
transposition=lvector,
124+
_new_order=lvector,
127125
inplace=scalar_bool,
126+
input_ndim=int64,
128127
)
129128

129+
@property
130+
def _new_order(self):
131+
# Param for C code.
132+
# self.new_order may contain 'x', which is not a valid integer value.
133+
# We replace it with -1.
134+
return [(-1 if x == "x" else x) for x in self.new_order]
135+
130136
def __init__(self, *, input_ndim: int, new_order: Sequence[int | Literal["x"]]):
131137
super().__init__([self.c_func_file], self.c_func_name)
132138

tests/tensor/test_elemwise.py

+26-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import itertools
12
import math
23
import re
34
import tracemalloc
@@ -10,6 +11,7 @@
1011
import pytensor.scalar as ps
1112
import pytensor.tensor as pt
1213
import tests.unittest_tools as utt
14+
from pytensor import In, Out
1315
from pytensor.compile.function import function
1416
from pytensor.compile.mode import Mode
1517
from pytensor.configdefaults import config
@@ -35,6 +37,7 @@
3537
matrix,
3638
scalar,
3739
tensor,
40+
tensor3,
3841
vector,
3942
vectors,
4043
)
@@ -158,11 +161,14 @@ def test_c_views(self):
158161
# as the broadcasted value; that way, we'll be able to tell that we're getting
159162
# junk data from a poorly constructed array view.
160163
x_val = np.broadcast_to(2039, (5000,))
161-
for i in range(1000):
164+
expected_x_val = x_val[None]
165+
for i in range(1):
162166
inputs[0].storage[0] = x_val
163167
thunk()
164168
# Make sure it's a view of the original data
165169
assert np.shares_memory(x_val, outputs[0].storage[0])
170+
# Confirm the right strides
171+
assert outputs[0].storage[0].strides == expected_x_val.strides
166172
# Confirm the broadcasted value in the output
167173
assert np.array_equiv(outputs[0].storage[0], 2039)
168174

@@ -212,6 +218,25 @@ def test_valid_input_ndim(self):
212218
with pytest.raises(TypeError, match="input_ndim must be an integer"):
213219
DimShuffle(input_ndim=(True, False), new_order=(1, 0))
214220

221+
def test_benchmark(self, benchmark):
222+
x = tensor3("x")
223+
x_val = np.random.random((2, 3, 4)).astype(config.floatX)
224+
ys = [x.transpose(t) for t in itertools.permutations((0, 1, 2))]
225+
ys += [
226+
x[None],
227+
x[:, None],
228+
x[:, :, None],
229+
x[:, :, :, None],
230+
]
231+
# Borrow to avoid deepcopy overhead
232+
fn = pytensor.function(
233+
[In(x, borrow=True)],
234+
[Out(y, borrow=True) for y in ys],
235+
)
236+
fn.dprint()
237+
fn.trust_input = True
238+
benchmark(fn, x_val)
239+
215240

216241
class TestBroadcast:
217242
# this is to allow other types to reuse this class to test their ops

0 commit comments

Comments
 (0)