Merge remote-tracking branch 'origin/master' into elementwise-floor-ceil-trunc

oleksandr-pavlyk · oleksandr-pavlyk · commit c4f972be2b70 · 2023-07-17T14:20:17.000-05:00
diff --git a/.github/workflows/generate-coverage.yaml b/.github/workflows/generate-coverage.yaml
@@ -79,7 +79,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml]
+          pip install numpy cython"<3" setuptools pytest pytest-cov scikit-build cmake coverage[toml]
 
       - name: Build dpctl with coverage
         shell: bash -l {0}
diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
@@ -49,7 +49,7 @@ jobs:
         if: ${{ !github.event.pull_request || github.event.action != 'closed' }}
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
+          pip install numpy cython"<3" setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
       - name: Checkout repo
         uses: actions/checkout@v3
         with:
diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml
@@ -108,7 +108,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest scikit-build cmake
+          pip install numpy cython"<3" setuptools pytest scikit-build cmake
 
       - name: Checkout repo
         uses: actions/checkout@v3
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
@@ -20,7 +20,7 @@ requirements:
         - cmake  >=3.21
         - ninja
         - git
-        - cython
+        - cython  <3
         - python
         - scikit-build
         - numpy
diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
@@ -213,6 +213,11 @@ def _copy_same_shape(dst, src):
     """Assumes src and dst have the same shape."""
     # check that memory regions do not overlap
     if ti._array_overlap(dst, src):
+        if src._pointer == dst._pointer and (
+            src is dst
+            or (src.strides == dst.strides and src.dtype == dst.dtype)
+        ):
+            return
         _copy_overlapping(src=src, dst=dst)
         return
 
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
@@ -52,6 +52,20 @@ def __call__(self, x, out=None, order="K"):
         if not isinstance(x, dpt.usm_ndarray):
             raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
 
+        if order not in ["C", "F", "K", "A"]:
+            order = "K"
+        buf_dt, res_dt = _find_buf_dtype(
+            x.dtype, self.result_type_resolver_fn_, x.sycl_device
+        )
+        if res_dt is None:
+            raise TypeError(
+                f"function '{self.name_}' does not support input type "
+                f"({x.dtype}), "
+                "and the input could not be safely coerced to any "
+                "supported types according to the casting rule ''safe''."
+            )
+
+        orig_out = out
         if out is not None:
             if not isinstance(out, dpt.usm_ndarray):
                 raise TypeError(
@@ -64,8 +78,21 @@ def __call__(self, x, out=None, order="K"):
                     f"Expected output shape is {x.shape}, got {out.shape}"
                 )
 
-            if ti._array_overlap(x, out):
-                raise TypeError("Input and output arrays have memory overlap")
+            if res_dt != out.dtype:
+                raise TypeError(
+                    f"Output array of type {res_dt} is needed,"
+                    f" got {out.dtype}"
+                )
+
+            if (
+                buf_dt is None
+                and ti._array_overlap(x, out)
+                and not ti._same_logical_tensors(x, out)
+            ):
+                # Allocate a temporary buffer to avoid memory overlapping.
+                # Note if `buf_dt` is not None, a temporary copy of `x` will be
+                # created, so the array overlap check isn't needed.
+                out = dpt.empty_like(out)
 
             if (
                 dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
@@ -75,18 +102,6 @@ def __call__(self, x, out=None, order="K"):
                     "Input and output allocation queues are not compatible"
                 )
 
-        if order not in ["C", "F", "K", "A"]:
-            order = "K"
-        buf_dt, res_dt = _find_buf_dtype(
-            x.dtype, self.result_type_resolver_fn_, x.sycl_device
-        )
-        if res_dt is None:
-            raise TypeError(
-                f"function '{self.name_}' does not support input type "
-                f"({x.dtype}), "
-                "and the input could not be safely coerced to any "
-                "supported types according to the casting rule ''safe''."
-            )
         exec_q = x.sycl_queue
         if buf_dt is None:
             if out is None:
@@ -96,17 +111,20 @@ def __call__(self, x, out=None, order="K"):
                     if order == "A":
                         order = "F" if x.flags.f_contiguous else "C"
                     out = dpt.empty_like(x, dtype=res_dt, order=order)
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f" got {out.dtype}"
-                    )
 
-            ht, _ = self.unary_fn_(x, out, sycl_queue=exec_q)
-            ht.wait()
+            ht_unary_ev, unary_ev = self.unary_fn_(x, out, sycl_queue=exec_q)
+
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev]
+                )
+                ht_copy_ev.wait()
+                out = orig_out
 
+            ht_unary_ev.wait()
             return out
+
         if order == "K":
             buf = _empty_like_orderK(x, buf_dt)
         else:
@@ -122,11 +140,6 @@ def __call__(self, x, out=None, order="K"):
                 out = _empty_like_orderK(buf, res_dt)
             else:
                 out = dpt.empty_like(buf, dtype=res_dt, order=order)
-        else:
-            if buf_dt != out.dtype:
-                raise TypeError(
-                    f"Output array of type {buf_dt} is needed, got {out.dtype}"
-                )
 
         ht, _ = self.unary_fn_(buf, out, sycl_queue=exec_q, depends=[copy_ev])
         ht_copy_ev.wait()
diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
@@ -100,6 +100,53 @@ struct MemoryOverlap
     }
 };
 
+struct SameLogicalTensors
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        // Same ndim
+        int nd1 = ar1.get_ndim();
+        if (nd1 != ar2.get_ndim())
+            return false;
+
+        // Same dtype
+        int tn1 = ar1.get_typenum();
+        if (tn1 != ar2.get_typenum())
+            return false;
+
+        // Same pointer
+        const char *ar1_data = ar1.get_data();
+        const char *ar2_data = ar2.get_data();
+
+        if (ar1_data != ar2_data)
+            return false;
+
+        // Same shape
+        const py::ssize_t *ar1_shape = ar1.get_shape_raw();
+        const py::ssize_t *ar2_shape = ar2.get_shape_raw();
+
+        if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape))
+            return false;
+
+        // Same strides
+        auto const &ar1_strides = ar1.get_strides_vector();
+        auto const &ar2_strides = ar2.get_strides_vector();
+
+        auto ar1_beg_it = std::begin(ar1_strides);
+        auto ar1_end_it = std::end(ar1_strides);
+
+        auto ar2_beg_it = std::begin(ar2_strides);
+
+        if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it))
+            return false;
+
+        // all checks passed: both arrays have the same ndim, dtype,
+        // data pointer, shape and strides
+        return true;
+    }
+};
+
 } // namespace overlap
 } // namespace tensor
 } // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
@@ -128,7 +128,9 @@ py_unary_ufunc(dpctl::tensor::usm_ndarray src,
 
     // check memory overlap
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if (overlap(src, dst) && !same_logical_tensors(src, dst)) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
 
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -60,6 +60,7 @@ using dpctl::tensor::c_contiguous_strides;
 using dpctl::tensor::f_contiguous_strides;
 
 using dpctl::tensor::overlap::MemoryOverlap;
+using dpctl::tensor::overlap::SameLogicalTensors;
 
 using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;
 
@@ -338,6 +339,15 @@ PYBIND11_MODULE(_tensor_impl, m)
           "Determines if the memory regions indexed by each array overlap",
           py::arg("array1"), py::arg("array2"));
 
+    auto same_logical_tensors = [](dpctl::tensor::usm_ndarray x1,
+                                   dpctl::tensor::usm_ndarray x2) -> bool {
+        auto const &same_logical_tensors = SameLogicalTensors();
+        return same_logical_tensors(x1, x2);
+    };
+    m.def("_same_logical_tensors", same_logical_tensors,
+          "Determines if the memory regions indexed by each array are the same",
+          py::arg("array1"), py::arg("array2"));
+
     m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
           py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
           py::arg("sycl_queue"), py::arg("depends") = py::list());
diff --git a/dpctl/tests/_numpy_warnings.py b/dpctl/tests/_numpy_warnings.py
@@ -0,0 +1,28 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+import pytest
+
+
+@pytest.fixture
+def suppress_invalid_numpy_warnings():
+    # invalid: treatment for invalid floating-point operation
+    # (result is not an expressible number, typically indicates
+    # that a NaN was produced)
+    old_settings = numpy.seterr(invalid="ignore")
+    yield
+    numpy.seterr(**old_settings)  # restore the settings saved above
diff --git a/dpctl/tests/conftest.py b/dpctl/tests/conftest.py
@@ -26,8 +26,15 @@
     invalid_filter,
     valid_filter,
 )
+from _numpy_warnings import suppress_invalid_numpy_warnings
 
 sys.path.append(os.path.join(os.path.dirname(__file__), "helper"))
 
 # common fixtures
-__all__ = ["check", "device_selector", "invalid_filter", "valid_filter"]
+__all__ = [
+    "check",
+    "device_selector",
+    "invalid_filter",
+    "suppress_invalid_numpy_warnings",
+    "valid_filter",
+]
diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py
@@ -22,7 +22,7 @@
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
-from .utils import _all_dtypes, _usm_types
+from .utils import _all_dtypes, _no_complex_dtypes, _usm_types
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -113,3 +113,25 @@ def test_abs_complex(dtype):
             np.testing.assert_allclose(
                 dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
             )
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_abs_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.abs(Xnp, out=Xnp)
+
+    Y = dpt.abs(X, out=X)  # out is X itself
+    assert Y is X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+
+    Ynp = np.abs(Xnp, out=Xnp[::-1])
+    Y = dpt.abs(X, out=X[::-1])  # out overlaps X but is a reversed view
+    assert Y is not X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+    assert np.allclose(dpt.asnumpy(Y), Ynp)
diff --git a/dpctl/tests/elementwise/test_exp.py b/dpctl/tests/elementwise/test_exp.py
@@ -145,3 +145,26 @@ def test_exp_strided(dtype):
                 atol=tol,
                 rtol=tol,
             )
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 1, 15, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.exp(Xnp, out=Xnp)
+
+    Y = dpt.exp(X, out=X)  # out is X itself
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert Y is X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+
+    Ynp = np.exp(Xnp, out=Xnp[::-1])
+    Y = dpt.exp(X, out=X[::-1])  # out overlaps X but is a reversed view
+    assert Y is not X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_log.py b/dpctl/tests/elementwise/test_log.py
diff --git a/dpctl/tests/elementwise/test_sincos.py b/dpctl/tests/elementwise/test_sincos.py
diff --git a/dpctl/tests/elementwise/test_sqrt.py b/dpctl/tests/elementwise/test_sqrt.py
diff --git a/dpctl/tests/elementwise/test_square.py b/dpctl/tests/elementwise/test_square.py
diff --git a/setup.py b/setup.py

Original file line number	Diff line number	Diff line change
`@@ -128,7 +128,9 @@ py_unary_ufunc(dpctl::tensor::usm_ndarray src,`
`128`	`128`
`129`	`129`	`// check memory overlap`
`130`	`130`	`auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();`
`131`		`- if (overlap(src, dst)) {`
	`131`	`+ auto const &same_logical_tensors =`
	`132`	`+ dpctl::tensor::overlap::SameLogicalTensors();`
	`133`	`+ if (overlap(src, dst) && !same_logical_tensors(src, dst)) {`
`132`	`134`	`throw py::value_error("Arrays index overlapping segments of memory");`
`133`	`135`	`}`
`134`	`136`