Adding Support for Offloading C++ standard algorithms #116869

xevor11 · 2024-11-19T19:59:36Z

A separate PR based on the suggestions by @ldionne referenced in #66968

github-actions · 2024-11-19T19:59:55Z

Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page.

If this is not working for you, it is probably because you do not have write permissions for the repository, in which case you can instead tag reviewers by name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide.

You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.

llvmbot · 2024-11-19T20:00:32Z

@llvm/pr-subscribers-libcxx

@llvm/pr-subscribers-libcxxabi

Author: Vedant Tewari (xevor11)

Changes

Patch is 371.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116869.diff

36 Files Affected:

(modified) .github/workflows/libcxx-build-and-test.yaml (+1)
(modified) libcxx/CMakeLists.txt (+10-2)
(added) libcxx/cmake/caches/Generic-pstl-openmp.cmake (+1)
(modified) libcxx/docs/UserDocumentation.rst (+207-132)
(modified) libcxx/docs/VendorDocumentation.rst (+227-180)
(modified) libcxx/include/CMakeLists.txt (+1)
(modified) libcxx/include/__algorithm/ranges_find_last.h (+55)
(added) libcxx/include/__algorithm/ranges_find_last_if.h (+81)
(added) libcxx/include/__algorithm/ranges_find_last_if_not.h (+81)
(added) libcxx/include/__algorithm/ranges_shift_left.h (+74)
(added) libcxx/include/__algorithm/ranges_shift_right.h (+75)
(modified) libcxx/include/__config_site.in (+1)
(modified) libcxx/include/__pstl/backend.h (+14-14)
(modified) libcxx/include/__pstl/backend_fwd.h (+10-10)
(added) libcxx/include/__pstl/backends/openmp.h (+511)
(modified) libcxx/include/__pstl/dispatch.h (+15)
(modified) libcxx/include/module.modulemap (+2064-2236)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp (+52)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp (+67)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp (+36)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp (+39)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp (+36)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp (+49)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp (+39)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp (+63)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp (+49)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp (+21)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp (+21)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp (+21)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp (+55)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp (+41)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp (+199)
(modified) libcxx/utils/ci/run-buildbot (+166-176)
(modified) libcxx/utils/libcxx/test/features.py (+104-178)
(modified) libcxx/utils/run.py (+15)
(modified) libcxxabi/CMakeLists.txt (+8)

diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 2184ddd49537b5..9e483612bc9943 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -158,6 +158,7 @@ jobs:
           'generic-no-wide-characters',
           'generic-no-rtti',
           'generic-optimized-speed',
+          'generic-pstl-openmp',
           'generic-static',
           'bootstrapping-build'
         ]
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index abe12c2805a7cf..dee2a75f74d89f 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -300,10 +300,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
 
 if (LIBCXX_ENABLE_THREADS)
-  set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
 else()
-  set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
 endif()
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std_thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
 
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -552,6 +553,11 @@ function(cxx_add_basic_build_flags target)
     endif()
   endif()
   target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
+
+  # If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
+  endif()
 endfunction()
 
 # Exception flags =============================================================
@@ -784,6 +790,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
   config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
 elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+  config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
                        Valid backends are: serial, std_thread and libdispatch")
diff --git a/libcxx/cmake/caches/Generic-pstl-openmp.cmake b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
new file mode 100644
index 00000000000000..f3ff4f3b57fd21
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
+set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
diff --git a/libcxx/docs/UserDocumentation.rst b/libcxx/docs/UserDocumentation.rst
index 2c1bc1373659c3..f1e7b19ead5798 100644
--- a/libcxx/docs/UserDocumentation.rst
+++ b/libcxx/docs/UserDocumentation.rst
@@ -1,17 +1,19 @@
-.. _user-documentation:
+.. _using-libcxx:
 
-==================
-User documentation
-==================
+============
+Using libc++
+============
 
 .. contents::
   :local:
 
+Usually, libc++ is packaged and shipped by a vendor through some delivery vehicle
+(operating system distribution, SDK, toolchain, etc) and users don't need to do
+anything special in order to use the library.
+
 This page contains information about configuration knobs that can be used by
 users when they know libc++ is used by their toolchain, and how to use libc++
-when it is not the default library used by their toolchain. It is aimed at
-users of libc++: a separate page contains documentation aimed at vendors who
-build and ship libc++ as part of their toolchain.
+when it is not the default library used by their toolchain.
 
 
 Using a different version of the C++ Standard
@@ -26,29 +28,10 @@ matches that Standard in the library.
 
   $ clang++ -std=c++17 test.cpp
 
-Note that using ``-std=c++XY`` with a version of the Standard that has not been ratified
-yet is considered unstable. While we strive to maintain stability, libc++ may be forced to
-make breaking changes to features shipped in a Standard that hasn't been ratified yet. Use
-these versions of the Standard at your own risk.
-
-
-Using libc++ when it is not the system default
-==============================================
-
-Usually, libc++ is packaged and shipped by a vendor through some delivery vehicle
-(operating system distribution, SDK, toolchain, etc) and users don't need to do
-anything special in order to use the library.
-
-On systems where libc++ is provided but is not the default, Clang provides a flag
-called ``-stdlib=`` that can be used to decide which standard library is used.
-Using ``-stdlib=libc++`` will select libc++:
-
-.. code-block:: bash
-
-  $ clang++ -stdlib=libc++ test.cpp
-
-On systems where libc++ is the library in use by default such as macOS and FreeBSD,
-this flag is not required.
+.. warning::
+  Using ``-std=c++XY`` with a version of the Standard that has not been ratified yet
+  is considered unstable. Libc++ reserves the right to make breaking changes to the
+  library until the standard has been ratified.
 
 
 Enabling experimental C++ Library features
@@ -60,19 +43,15 @@ the Standard but whose implementation is not complete or stable yet in libc++. T
 are disabled by default because they are neither API nor ABI stable. However, the
 ``-fexperimental-library`` compiler flag can be defined to turn those features on.
 
-On compilers that do not support the ``-fexperimental-library`` flag (such as GCC),
-users can define the ``_LIBCPP_ENABLE_EXPERIMENTAL`` macro and manually link against
-the appropriate static library (usually shipped as ``libc++experimental.a``) to get
-access to experimental library features.
-
 The following features are currently considered experimental and are only provided
 when ``-fexperimental-library`` is passed:
 
 * The parallel algorithms library (``<execution>`` and the associated algorithms)
+* ``std::stop_token``, ``std::stop_source`` and ``std::stop_callback``
+* ``std::jthread``
 * ``std::chrono::tzdb`` and related time zone functionality
-* ``<syncstream>``
 
-.. note::
+.. warning::
   Experimental libraries are experimental.
     * The contents of the ``<experimental/...>`` headers and the associated static
       library will not remain compatible between versions.
@@ -81,18 +60,98 @@ when ``-fexperimental-library`` is passed:
       the experimental feature is removed two releases after the non-experimental
       version has shipped. The full policy is explained :ref:`here <experimental features>`.
 
+.. note::
+  On compilers that do not support the ``-fexperimental-library`` flag, users can
+  define the ``_LIBCPP_ENABLE_EXPERIMENTAL`` macro and manually link against the
+  appropriate static library (usually shipped as ``libc++experimental.a``) to get
+  access to experimental library features.
 
-Libc++ Configuration Macros
+
+Using libc++ when it is not the system default
+==============================================
+
+On systems where libc++ is provided but is not the default, Clang provides a flag
+called ``-stdlib=`` that can be used to decide which standard library is used.
+Using ``-stdlib=libc++`` will select libc++:
+
+.. code-block:: bash
+
+  $ clang++ -stdlib=libc++ test.cpp
+
+On systems where libc++ is the library in use by default such as macOS and FreeBSD,
+this flag is not required.
+
+
+.. _alternate libcxx:
+
+Using a custom built libc++
 ===========================
 
-Libc++ provides a number of configuration macros that can be used by developers to
-enable or disable extended libc++ behavior.
+Most compilers provide a way to disable the default behavior for finding the
+standard library and to override it with custom paths. With Clang, this can
+be done with:
 
-.. warning::
-  Configuration macros that are not documented here are not intended to be customized
-  by developers and should not be used. In particular, some configuration macros are
-  only intended to be used by vendors and changing their value from the one provided
-  in your toolchain can lead to unexpected behavior.
+.. code-block:: bash
+
+  $ clang++ -nostdinc++ -nostdlib++           \
+            -isystem <install>/include/c++/v1 \
+            -L <install>/lib                  \
+            -Wl,-rpath,<install>/lib          \
+            -lc++                             \
+            test.cpp
+
+The option ``-Wl,-rpath,<install>/lib`` adds a runtime library search path,
+which causes the system's dynamic linker to look for libc++ in ``<install>/lib``
+whenever the program is loaded.
+
+GCC does not support the ``-nostdlib++`` flag, so one must use ``-nodefaultlibs``
+instead. Since that removes all the standard system libraries and not just libc++,
+the system libraries must be re-added manually. For example:
+
+.. code-block:: bash
+
+  $ g++ -nostdinc++ -nodefaultlibs           \
+        -isystem <install>/include/c++/v1    \
+        -L <install>/lib                     \
+        -Wl,-rpath,<install>/lib             \
+        -lc++ -lc++abi -lm -lc -lgcc_s -lgcc \
+        test.cpp
+
+
+GDB Pretty printers for libc++
+==============================
+
+GDB does not support pretty-printing of libc++ symbols by default. However, libc++ does
+provide pretty-printers itself. Those can be used as:
+
+.. code-block:: bash
+
+  $ gdb -ex "source <libcxx>/utils/gdb/libcxx/printers.py" \
+        -ex "python register_libcxx_printer_loader()" \
+        <args>
+
+.. _include-what-you-use:
+
+include-what-you-use (IWYU)
+===========================
+
+libc++ provides an IWYU `mapping file <https://github.com/include-what-you-use/include-what-you-use/blob/master/docs/IWYUMappings.md>`_,
+which drastically improves the accuracy of the tool when using libc++. To use the mapping file with
+IWYU, you should run the tool like so:
+
+.. code-block:: bash
+
+  $ include-what-you-use -Xiwyu --mapping_file=/path/to/libcxx/include/libcxx.imp file.cpp
+
+If you would prefer to not use that flag, then you can replace ``/path/to/include-what-you-use/share/libcxx.imp``
+file with the libc++-provided ``libcxx.imp`` file.
+
+Libc++ Configuration Macros
+===========================
+
+Libc++ provides a number of configuration macros which can be used to enable
+or disable extended libc++ behavior, including enabling hardening or thread
+safety annotations.
 
 **_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS**:
   This macro is used to enable -Wthread-safety annotations on libc++'s
@@ -134,12 +193,6 @@ enable or disable extended libc++ behavior.
   warning saying that `std::auto_ptr` is deprecated. If the macro is defined,
   no warning will be emitted. By default, this macro is not defined.
 
-**_LIBCPP_ENABLE_EXPERIMENTAL**:
-  This macro enables experimental features. This can be used on compilers that do
-  not support the ``-fexperimental-library`` flag. When used, users also need to
-  ensure that the appropriate experimental library (usually ``libc++experimental.a``)
-  is linked into their program.
-
 C++17 Specific Configuration Macros
 -----------------------------------
 **_LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR**:
@@ -156,18 +209,12 @@ C++17 Specific Configuration Macros
 **_LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE**:
   This macro is used to re-enable the `random_shuffle` algorithm.
 
-**_LIBCPP_ENABLE_CXX17_REMOVED_UNARY_BINARY_FUNCTION**:
-  This macro is used to re-enable `unary_function` and `binary_function`.
-
 **_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS**:
   This macro is used to re-enable `set_unexpected`, `get_unexpected`, and
   `unexpected`.
 
 C++20 Specific Configuration Macros
 -----------------------------------
-**_LIBCPP_ENABLE_CXX20_REMOVED_UNCAUGHT_EXCEPTION**:
-  This macro is used to re-enable `uncaught_exception`.
-
 **_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE**:
   This macro is used to re-enable the function
   ``std::shared_ptr<...>::unique()``.
@@ -184,9 +231,6 @@ C++20 Specific Configuration Macros
 **_LIBCPP_ENABLE_CXX20_REMOVED_RAW_STORAGE_ITERATOR**:
   This macro is used to re-enable `raw_storage_iterator`.
 
-**_LIBCPP_ENABLE_CXX20_REMOVED_TEMPORARY_BUFFER**:
-  This macro is used to re-enable `get_temporary_buffer` and `return_temporary_buffer`.
-
 **_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS**:
   This macro is used to re-enable `is_literal_type`, `is_literal_type_v`,
   `result_of` and `result_of_t`.
@@ -263,7 +307,7 @@ Extensions to the C++23 modules ``std`` and ``std.compat``
 ----------------------------------------------------------
 
 Like other major implementations, libc++ provides C++23 modules ``std`` and
-``std.compat`` in C++20 as an extension.
+``std.compat`` in C++20 as an extension.
 
 Constant-initialized std::string
 --------------------------------
@@ -320,14 +364,109 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.
 
-Support for compiler extensions
--------------------------------
+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
+enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
+You may also want to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension. It implements the classical vector addition from
+BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
+both used as an input and an output iterator in this example.
+
+.. code-block:: cpp
+
+  #include <algorithm>
+  #include <execution>
+
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
+  }
 
-Clang, GCC and other compilers all provide their own set of language extensions. These extensions
-have often been developed without particular consideration for their interaction with the library,
-and as such, libc++ does not go out of its way to support them. The library may support specific
-compiler extensions which would then be documented explicitly, but the basic expectation should be
-that no special support is provided for arbitrary compiler extensions.
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs, and for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. If capturing by reference, the above example could
+be implemented in the following way.
+
+.. code-block:: cpp
+
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+  #pragma omp target data map(to : a)
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
+  }
+
+However, if unified shared memory, USM, is enabled, no additional data mapping
+is necessary when capturing by reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The C++ standard defines that all accesses to memory are inside a single address
+space. However, discrete GPU systems have distinct address spaces. A single
+address space can be emulated if your system supports unified shared memory.
+However, many discrete GPU systems do not, and in those cases it is important to
+pass device function pointers to the parallel algorithms. Below is an example of
+how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
+used to mark that a function should be compiled for both host and device.
+
+.. code-block:: cpp
+
+  // This function computes the squared difference of two floating points
+  float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
+
+  // Declare that the function must be compiled for both host and device
+  #pragma omp declare target indirect to(squared)
+
+  int main() {
+    std::vector<float> a(100, 1.0);
+    std::vector<float> b(100, 1.25);
+
+    // Pass the host function pointer to the parallel algorithm and let OpenMP
+    // translate it to the device function pointer internally
+    float sum =
+        std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
+                              b.begin(), 0.0f, std::plus{}, squared);
+
+    // Validate that the result is approximately 6.25
+    assert(std::abs(sum - 6.25f) < 1e-10);
+    return 0;
+  }
+
+Without unified shared memory, the above example will not work if the host
+function pointer ``squared`` is passed to the parallel algorithm.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling. If compiling a program
+containing parallel algorithms with current versions of Clang, a program with
+exceptions in offloaded code regions will compile, but the program will
+terminate if an exception is thrown on the device. This does not conform with
+the C++ standard and exception handling on GPUs will hopefully be better
+supported in future releases of LLVM.
 
 Platform specific behavior
 ==========================
@@ -351,67 +490,3 @@ specific locale is imbued, the IO with the underlying stream happens with
 regular ``char`` elements, which are converted to/from wide characters
 according to the locale. Note that this doesn't behave as expected if the
 stream has been set in Unicode mode.
-
-
-Third-party Integrations
-========================
-
-Libc++ provides integration with a few third-party tools.
-
-Debugging libc++ internals in LLDB
-----------------------------------
-
-LLDB hides the implementation details of libc++ by default.
-
-E.g., when setting a breakpoint in a comparator passed to ``std::sort``, the
-backtrace will read as
-
-.. code-block::
-
-  (lldb) thread backtrace
-  * thread #1, name = 'a.out', stop reason = breakpoint 3.1
-    * frame #0: 0x000055555555520e a.out`my_comparator(a=1, b=8) at test-std-sort.cpp:6:3
-      frame #7: 0x0000555555555615 a.out`void std::__1::sort[abi:ne200000]<std::__1::__wrap_iter<int*>, bool (*)(int, int)>(__first=(item = 8), __last=(item = 0), __comp=(a.out`my_less(int, int) at test-std-sort.cpp:5)) at sort.h:1003:3
-      frame #8: 0x000055555555531a a.out`main at test-std-sort.cpp:24:3
-
-Note how the caller of ``my_comparator`` is shown as ``std::sort``. Looking at
-the frame numbers, we can see that frames #1 until #6 were hidden. Those frames
-represent internal implementation details such as ``__sort4`` and similar
-utility functions.
-
-To also show those implementation details, use ``thread backtrace -u``.
-Alternatively, to disable those compact backtraces, use ``frame recognizer list``
-and ``frame recognizer disable`` on the "libc++ frame recognizer".
-
-Futhermore, stepping into libc++ functions is disabled by default. This is controlled via the
-setting ``target.process.thread.step-avoid-regexp`` which defaults to ``^std::`` and can be
-disabled using ``settings set target.process.thread.step-avoid-regexp ""``.
-
-GDB ...
[truncated]

llvmbot · 2024-11-19T20:00:32Z

@llvm/pr-subscribers-github-workflow

Author: Vedant Tewari (xevor11)

Changes

Patch is 371.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116869.diff

36 Files Affected:

(modified) .github/workflows/libcxx-build-and-test.yaml (+1)
(modified) libcxx/CMakeLists.txt (+10-2)
(added) libcxx/cmake/caches/Generic-pstl-openmp.cmake (+1)
(modified) libcxx/docs/UserDocumentation.rst (+207-132)
(modified) libcxx/docs/VendorDocumentation.rst (+227-180)
(modified) libcxx/include/CMakeLists.txt (+1)
(modified) libcxx/include/__algorithm/ranges_find_last.h (+55)
(added) libcxx/include/__algorithm/ranges_find_last_if.h (+81)
(added) libcxx/include/__algorithm/ranges_find_last_if_not.h (+81)
(added) libcxx/include/__algorithm/ranges_shift_left.h (+74)
(added) libcxx/include/__algorithm/ranges_shift_right.h (+75)
(modified) libcxx/include/__config_site.in (+1)
(modified) libcxx/include/__pstl/backend.h (+14-14)
(modified) libcxx/include/__pstl/backend_fwd.h (+10-10)
(added) libcxx/include/__pstl/backends/openmp.h (+511)
(modified) libcxx/include/__pstl/dispatch.h (+15)
(modified) libcxx/include/module.modulemap (+2064-2236)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp (+52)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp (+67)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp (+36)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp (+39)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp (+36)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp (+49)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp (+39)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp (+63)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp (+49)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp (+21)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp (+21)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp (+21)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp (+55)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp (+41)
(added) libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp (+199)
(modified) libcxx/utils/ci/run-buildbot (+166-176)
(modified) libcxx/utils/libcxx/test/features.py (+104-178)
(modified) libcxx/utils/run.py (+15)
(modified) libcxxabi/CMakeLists.txt (+8)

diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 2184ddd49537b5..9e483612bc9943 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -158,6 +158,7 @@ jobs:
           'generic-no-wide-characters',
           'generic-no-rtti',
           'generic-optimized-speed',
+          'generic-pstl-openmp',
           'generic-static',
           'bootstrapping-build'
         ]
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index abe12c2805a7cf..dee2a75f74d89f 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -300,10 +300,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
 
 if (LIBCXX_ENABLE_THREADS)
-  set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
 else()
-  set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
 endif()
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std_thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
 
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -552,6 +553,11 @@ function(cxx_add_basic_build_flags target)
     endif()
   endif()
   target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
+
+  # If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
+  endif()
 endfunction()
 
 # Exception flags =============================================================
@@ -784,6 +790,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
   config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
 elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+  config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
                        Valid backends are: serial, std_thread and libdispatch")
diff --git a/libcxx/cmake/caches/Generic-pstl-openmp.cmake b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
new file mode 100644
index 00000000000000..f3ff4f3b57fd21
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
+set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
diff --git a/libcxx/docs/UserDocumentation.rst b/libcxx/docs/UserDocumentation.rst
index 2c1bc1373659c3..f1e7b19ead5798 100644
--- a/libcxx/docs/UserDocumentation.rst
+++ b/libcxx/docs/UserDocumentation.rst
@@ -1,17 +1,19 @@
-.. _user-documentation:
+.. _using-libcxx:
 
-==================
-User documentation
-==================
+============
+Using libc++
+============
 
 .. contents::
   :local:
 
+Usually, libc++ is packaged and shipped by a vendor through some delivery vehicle
+(operating system distribution, SDK, toolchain, etc) and users don't need to do
+anything special in order to use the library.
+
 This page contains information about configuration knobs that can be used by
 users when they know libc++ is used by their toolchain, and how to use libc++
-when it is not the default library used by their toolchain. It is aimed at
-users of libc++: a separate page contains documentation aimed at vendors who
-build and ship libc++ as part of their toolchain.
+when it is not the default library used by their toolchain.
 
 
 Using a different version of the C++ Standard
@@ -26,29 +28,10 @@ matches that Standard in the library.
 
   $ clang++ -std=c++17 test.cpp
 
-Note that using ``-std=c++XY`` with a version of the Standard that has not been ratified
-yet is considered unstable. While we strive to maintain stability, libc++ may be forced to
-make breaking changes to features shipped in a Standard that hasn't been ratified yet. Use
-these versions of the Standard at your own risk.
-
-
-Using libc++ when it is not the system default
-==============================================
-
-Usually, libc++ is packaged and shipped by a vendor through some delivery vehicle
-(operating system distribution, SDK, toolchain, etc) and users don't need to do
-anything special in order to use the library.
-
-On systems where libc++ is provided but is not the default, Clang provides a flag
-called ``-stdlib=`` that can be used to decide which standard library is used.
-Using ``-stdlib=libc++`` will select libc++:
-
-.. code-block:: bash
-
-  $ clang++ -stdlib=libc++ test.cpp
-
-On systems where libc++ is the library in use by default such as macOS and FreeBSD,
-this flag is not required.
+.. warning::
+  Using ``-std=c++XY`` with a version of the Standard that has not been ratified yet
+  is considered unstable. Libc++ reserves the right to make breaking changes to the
+  library until the standard has been ratified.
 
 
 Enabling experimental C++ Library features
@@ -60,19 +43,15 @@ the Standard but whose implementation is not complete or stable yet in libc++. T
 are disabled by default because they are neither API nor ABI stable. However, the
 ``-fexperimental-library`` compiler flag can be defined to turn those features on.
 
-On compilers that do not support the ``-fexperimental-library`` flag (such as GCC),
-users can define the ``_LIBCPP_ENABLE_EXPERIMENTAL`` macro and manually link against
-the appropriate static library (usually shipped as ``libc++experimental.a``) to get
-access to experimental library features.
-
 The following features are currently considered experimental and are only provided
 when ``-fexperimental-library`` is passed:
 
 * The parallel algorithms library (``<execution>`` and the associated algorithms)
+* ``std::stop_token``, ``std::stop_source`` and ``std::stop_callback``
+* ``std::jthread``
 * ``std::chrono::tzdb`` and related time zone functionality
-* ``<syncstream>``
 
-.. note::
+.. warning::
   Experimental libraries are experimental.
     * The contents of the ``<experimental/...>`` headers and the associated static
       library will not remain compatible between versions.
@@ -81,18 +60,98 @@ when ``-fexperimental-library`` is passed:
       the experimental feature is removed two releases after the non-experimental
       version has shipped. The full policy is explained :ref:`here <experimental features>`.
 
+.. note::
+  On compilers that do not support the ``-fexperimental-library`` flag, users can
+  define the ``_LIBCPP_ENABLE_EXPERIMENTAL`` macro and manually link against the
+  appropriate static library (usually shipped as ``libc++experimental.a``) to get
+  access to experimental library features.
 
-Libc++ Configuration Macros
+
+Using libc++ when it is not the system default
+==============================================
+
+On systems where libc++ is provided but is not the default, Clang provides a flag
+called ``-stdlib=`` that can be used to decide which standard library is used.
+Using ``-stdlib=libc++`` will select libc++:
+
+.. code-block:: bash
+
+  $ clang++ -stdlib=libc++ test.cpp
+
+On systems where libc++ is the library in use by default such as macOS and FreeBSD,
+this flag is not required.
+
+
+.. _alternate libcxx:
+
+Using a custom built libc++
 ===========================
 
-Libc++ provides a number of configuration macros that can be used by developers to
-enable or disable extended libc++ behavior.
+Most compilers provide a way to disable the default behavior for finding the
+standard library and to override it with custom paths. With Clang, this can
+be done with:
 
-.. warning::
-  Configuration macros that are not documented here are not intended to be customized
-  by developers and should not be used. In particular, some configuration macros are
-  only intended to be used by vendors and changing their value from the one provided
-  in your toolchain can lead to unexpected behavior.
+.. code-block:: bash
+
+  $ clang++ -nostdinc++ -nostdlib++           \
+            -isystem <install>/include/c++/v1 \
+            -L <install>/lib                  \
+            -Wl,-rpath,<install>/lib          \
+            -lc++                             \
+            test.cpp
+
+The option ``-Wl,-rpath,<install>/lib`` adds a runtime library search path,
+which causes the system's dynamic linker to look for libc++ in ``<install>/lib``
+whenever the program is loaded.
+
+GCC does not support the ``-nostdlib++`` flag, so one must use ``-nodefaultlibs``
+instead. Since that removes all the standard system libraries and not just libc++,
+the system libraries must be re-added manually. For example:
+
+.. code-block:: bash
+
+  $ g++ -nostdinc++ -nodefaultlibs           \
+        -isystem <install>/include/c++/v1    \
+        -L <install>/lib                     \
+        -Wl,-rpath,<install>/lib             \
+        -lc++ -lc++abi -lm -lc -lgcc_s -lgcc \
+        test.cpp
+
+
+GDB Pretty printers for libc++
+==============================
+
+GDB does not support pretty-printing of libc++ symbols by default. However, libc++ does
+provide pretty-printers itself. Those can be used as:
+
+.. code-block:: bash
+
+  $ gdb -ex "source <libcxx>/utils/gdb/libcxx/printers.py" \
+        -ex "python register_libcxx_printer_loader()" \
+        <args>
+
+.. _include-what-you-use:
+
+include-what-you-use (IWYU)
+===========================
+
+libc++ provides an IWYU `mapping file <https://github.com/include-what-you-use/include-what-you-use/blob/master/docs/IWYUMappings.md>`_,
+which drastically improves the accuracy of the tool when using libc++. To use the mapping file with
+IWYU, you should run the tool like so:
+
+.. code-block:: bash
+
+  $ include-what-you-use -Xiwyu --mapping_file=/path/to/libcxx/include/libcxx.imp file.cpp
+
+If you would prefer to not use that flag, then you can replace the ``/path/to/include-what-you-use/share/libcxx.imp``
+file with the libc++-provided ``libcxx.imp`` file.
+
+Libc++ Configuration Macros
+===========================
+
+Libc++ provides a number of configuration macros which can be used to enable
+or disable extended libc++ behavior, including enabling hardening or thread
+safety annotations.
 
 **_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS**:
   This macro is used to enable -Wthread-safety annotations on libc++'s
@@ -134,12 +193,6 @@ enable or disable extended libc++ behavior.
   warning saying that `std::auto_ptr` is deprecated. If the macro is defined,
   no warning will be emitted. By default, this macro is not defined.
 
-**_LIBCPP_ENABLE_EXPERIMENTAL**:
-  This macro enables experimental features. This can be used on compilers that do
-  not support the ``-fexperimental-library`` flag. When used, users also need to
-  ensure that the appropriate experimental library (usually ``libc++experimental.a``)
-  is linked into their program.
-
 C++17 Specific Configuration Macros
 -----------------------------------
 **_LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR**:
@@ -156,18 +209,12 @@ C++17 Specific Configuration Macros
 **_LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE**:
   This macro is used to re-enable the `random_shuffle` algorithm.
 
-**_LIBCPP_ENABLE_CXX17_REMOVED_UNARY_BINARY_FUNCTION**:
-  This macro is used to re-enable `unary_function` and `binary_function`.
-
 **_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS**:
   This macro is used to re-enable `set_unexpected`, `get_unexpected`, and
   `unexpected`.
 
 C++20 Specific Configuration Macros
 -----------------------------------
-**_LIBCPP_ENABLE_CXX20_REMOVED_UNCAUGHT_EXCEPTION**:
-  This macro is used to re-enable `uncaught_exception`.
-
 **_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE**:
   This macro is used to re-enable the function
   ``std::shared_ptr<...>::unique()``.
@@ -184,9 +231,6 @@ C++20 Specific Configuration Macros
 **_LIBCPP_ENABLE_CXX20_REMOVED_RAW_STORAGE_ITERATOR**:
   This macro is used to re-enable `raw_storage_iterator`.
 
-**_LIBCPP_ENABLE_CXX20_REMOVED_TEMPORARY_BUFFER**:
-  This macro is used to re-enable `get_temporary_buffer` and `return_temporary_buffer`.
-
 **_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS**:
   This macro is used to re-enable `is_literal_type`, `is_literal_type_v`,
   `result_of` and `result_of_t`.
@@ -263,7 +307,7 @@ Extensions to the C++23 modules ``std`` and ``std.compat``
 ----------------------------------------------------------
 
 Like other major implementations, libc++ provides C++23 modules ``std`` and
-``std.compat`` in C++20 as an extension.
+``std.compat`` in C++20 as an extension.
 
 Constant-initialized std::string
 --------------------------------
@@ -320,14 +364,109 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.
 
-Support for compiler extensions
--------------------------------
+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
+enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
+You may also want to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension. It implements the classical vector addition from
+BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
+used as both an input and an output iterator in this example.
+
+.. code-block:: cpp
+
+  #include <algorithm>
+  #include <execution>
+
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
+  }
 
-Clang, GCC and other compilers all provide their own set of language extensions. These extensions
-have often been developed without particular consideration for their interaction with the library,
-and as such, libc++ does not go out of its way to support them. The library may support specific
-compiler extensions which would then be documented explicitly, but the basic expectation should be
-that no special support is provided for arbitrary compiler extensions.
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs, and for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. If capturing by reference, the above example could
+be implemented in the following way.
+
+.. code-block:: cpp
+
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+  #pragma omp target data map(to : a)
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
+  }
+
+However, if unified shared memory (USM) is enabled, no additional data mapping
+is necessary when capturing by reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The C++ standard defines that all accesses to memory are inside a single address
+space. However, discrete GPU systems have distinct address spaces. A single
+address space can be emulated if your system supports unified shared memory.
+However, many discrete GPU systems do not, and in those cases it is important to
+pass device function pointers to the parallel algorithms. Below is an example of
+how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
+used to mark that a function should be compiled for both host and device.
+
+.. code-block:: cpp
+
+  // This function computes the squared difference of two floating points
+  float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
+
+  // Declare that the function must be compiled for both host and device
+  #pragma omp declare target indirect to(squared)
+
+  int main() {
+    std::vector<float> a(100, 1.0);
+    std::vector<float> b(100, 1.25);
+
+    // Pass the host function pointer to the parallel algorithm and let OpenMP
+    // translate it to the device function pointer internally
+    float sum =
+        std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
+                              b.begin(), 0.0f, std::plus{}, squared);
+
+    // Validate that the result is approximately 6.25
+    assert(std::abs(sum - 6.25f) < 1e-10);
+    return 0;
+  }
+
+Without unified shared memory, the above example only works because ``squared``
+is marked ``declare target indirect``; a plain host function pointer would not be valid on the device.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling. If compiling a program
+containing parallel algorithms with current versions of Clang, a program with
+exceptions in offloaded code regions will compile, but the program will
+terminate if an exception is thrown on the device. This does not conform to
+the C++ standard, and exception handling on GPUs will hopefully be better
+supported in future releases of LLVM.
 
 Platform specific behavior
 ==========================
@@ -351,67 +490,3 @@ specific locale is imbued, the IO with the underlying stream happens with
 regular ``char`` elements, which are converted to/from wide characters
 according to the locale. Note that this doesn't behave as expected if the
 stream has been set in Unicode mode.
-
-
-Third-party Integrations
-========================
-
-Libc++ provides integration with a few third-party tools.
-
-Debugging libc++ internals in LLDB
-----------------------------------
-
-LLDB hides the implementation details of libc++ by default.
-
-E.g., when setting a breakpoint in a comparator passed to ``std::sort``, the
-backtrace will read as
-
-.. code-block::
-
-  (lldb) thread backtrace
-  * thread #1, name = 'a.out', stop reason = breakpoint 3.1
-    * frame #0: 0x000055555555520e a.out`my_comparator(a=1, b=8) at test-std-sort.cpp:6:3
-      frame #7: 0x0000555555555615 a.out`void std::__1::sort[abi:ne200000]<std::__1::__wrap_iter<int*>, bool (*)(int, int)>(__first=(item = 8), __last=(item = 0), __comp=(a.out`my_less(int, int) at test-std-sort.cpp:5)) at sort.h:1003:3
-      frame #8: 0x000055555555531a a.out`main at test-std-sort.cpp:24:3
-
-Note how the caller of ``my_comparator`` is shown as ``std::sort``. Looking at
-the frame numbers, we can see that frames #1 until #6 were hidden. Those frames
-represent internal implementation details such as ``__sort4`` and similar
-utility functions.
-
-To also show those implementation details, use ``thread backtrace -u``.
-Alternatively, to disable those compact backtraces, use ``frame recognizer list``
-and ``frame recognizer disable`` on the "libc++ frame recognizer".
-
-Futhermore, stepping into libc++ functions is disabled by default. This is controlled via the
-setting ``target.process.thread.step-avoid-regexp`` which defaults to ``^std::`` and can be
-disabled using ``settings set target.process.thread.step-avoid-regexp ""``.
-
-GDB ...
[truncated]

xevor11 · 2024-11-24T17:45:35Z

For the module.modulemap file do we go with the changes in main or in #66968 @jdoerfert

jdoerfert · 2024-11-25T16:45:14Z

> For the module.modulemap file do we go with the changes in main or in #66968 @jdoerfert

Apply the change manually to main, thus, take the module.modulemap file from main, and then add

module std_private_pstl_backends_openmp            [system] {
  header "__pstl/backends/openmp.h"
  export *
}

or whatever the path now looks like, to the main module.map

Does it pass the libcxx tests locally?

jdoerfert · 2024-11-25T16:47:18Z

Can you split your 2 additions off this PR, please.
We need one PR which is basically a rebased #66968, either on top of #66968 or here.
And one PR per new feature/algorithm class you add.

vidsinghal · 2024-12-09T01:02:22Z

Hello, I am having a difficult time compiling the tests in the PR.

I build this PR code with the following command

cmake -G Ninja -DCMAKE_BUILD_TYPE=Release \
          -DCMAKE_C_COMPILER=gcc \
          -DCMAKE_CXX_COMPILER=g++ \
          -DCMAKE_INSTALL_PREFIX="$INSTALLDIR" \
          -DLIBCXX_ENABLE_WERROR=YES \
          -DLIBCXXABI_ENABLE_WERROR=YES \
          -DLIBUNWIND_ENABLE_WERROR=YES \
          -DLIBCXX_ENABLE_CLANG_TIDY=ON \
          -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \
          -DLLVM_ENABLE_PROJECTS="clang;lld;" -DCLANG_DEFAULT_LINKER="lld" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;offload;pstl;openmp" -DLIBCXX_PSTL_BACKEND="openmp" -DLLVM_TARGETS_TO_BUILD="X86;AMDGPU" \
          -DPSTL_PARALLEL_BACKEND="omp" -DLIBCXX_CXX_ABI=libcxxabi -DLIBCXX_ENABLE_THREADS=ON \
                $CLANG_ROOT/llvm-project/llvm

I am trying to compile one of the tests with the following command:

clang++  -Wl,-rpath,../clang/build/lib  -I ../clang/build/projects/runtimes/src/  -I ../clang/build/runtimes/runtimes-bins/openmp/runtime/src/  -fopenmp -fexperimental-library -fopenmp-targets=amdgcn-amd-amdhsa test.cpp -o test

However, it throws an error about TBB, which is interesting, since I specifically compiled for the OpenMP backend.

ld.lld: error: undefined symbol: tbb::interface7::internal::isolate_within_arena(tbb::interface7::internal::delegate_base&, long)
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(__pstl::execution::v1::parallel_unsequenced_policy const& tbb::interface7::internal::isolate_impl<void, void __pstl::__tbb_backend::__parallel_for<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0)::'lambda'() const>(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>&))

ld.lld: error: undefined symbol: tbb::task_group_context::~task_group_context()
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::interface9::internal::start_for<tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>>, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)>, tbb::auto_partitioner const>::run(tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>> const&, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)> const&, 
tbb::auto_partitioner const&))
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::interface9::internal::start_for<tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>>, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)>, tbb::auto_partitioner const>::run(tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>> const&, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)> const&, 
tbb::auto_partitioner const&))

ld.lld: error: undefined symbol: tbb::task_group_context::init()
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::task_group_context::task_group_context(tbb::task_group_context::kind_type, unsigned long))

ld.lld: error: undefined symbol: tbb::internal::allocate_root_with_context_proxy::allocate(unsigned long) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(operator new(unsigned long, tbb::internal::allocate_root_with_context_proxy const&))

ld.lld: error: undefined symbol: tbb::internal::allocate_root_with_context_proxy::free(tbb::task&) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(operator delete(void*, tbb::internal::allocate_root_with_context_proxy const&))

ld.lld: error: undefined symbol: vtable for tbb::task
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::task::task())
>>> the vtable symbol may be undefined because the class is missing its key function (see https://lld.llvm.org/missingkeyfunction)

ld.lld: error: undefined symbol: tbb::internal::get_initial_auto_partitioner_divisor()
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::interface9::internal::adaptive_mode<tbb::interface9::internal::auto_partition_type>::adaptive_mode())

ld.lld: error: undefined symbol: tbb::internal::allocate_child_proxy::allocate(unsigned long) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::interface9::internal::allocate_sibling(tbb::task*, unsigned long))

ld.lld: error: undefined symbol: tbb::internal::allocate_continuation_proxy::allocate(unsigned long) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(operator new(unsigned long, tbb::internal::allocate_continuation_proxy const&))

ld.lld: error: undefined symbol: tbb::internal::allocate_continuation_proxy::free(tbb::task&) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(operator delete(void*, tbb::internal::allocate_continuation_proxy const&))

ld.lld: error: undefined symbol: tbb::task_group_context::is_group_execution_cancelled() const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::task::is_cancelled() const)

ld.lld: error: undefined symbol: typeinfo for tbb::task
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(typeinfo for tbb::interface9::internal::start_for<tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>>, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)>, tbb::auto_partitioner const>)
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(typeinfo for tbb::interface9::internal::flag_task)

ld.lld: error: undefined symbol: tbb::task::note_affinity(unsigned short)
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(vtable for tbb::interface9::internal::flag_task)
/usr/bin/clang-linker-wrapper: error: 'ld.lld' failed
clang++: error: linker command failed with exit code 1 (use -v to see invocation)

I can fix this error if I add -ltbb to the end of the compile command.
But then it is never offloaded,
although if I add -save-temps I do see files compiled for the AMD GPU target.

Do you have any clue what is happening here?

In the
host-x86_64-unknown-linux-gnu.s file, it is making calls to the TBB version of the backend (`_ZNK6__pstl13__tbb_backend19__parallel_for`) instead of the OpenMP version.

AntonRydahl and others added 2 commits November 14, 2024 15:47

Adding OpenMP Offloading Backend for C++ Parallel Algorithms Rebased

5c8fefb

Algorithms for ranges shift left and shift right

b13df77

xevor11 requested review from a team as code owners November 19, 2024 19:59

llvmbot added libc++ libc++ C++ Standard Library. Not GNU libstdc++. Not libc++abi. libc++abi libc++abi C++ Runtime Library. Not libc++. github:workflow labels Nov 19, 2024

xevor11 assigned ldionne, AntonRydahl and jdoerfert Nov 19, 2024

Algorithms for ranges_find_last, find_last_if, and find_last_if_not

7795436

xevor11 force-pushed the libcxx_pstl_omp_offload_backend_test branch from faf7890 to 7795436 Compare November 24, 2024 17:41

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Adding Support for Offloading C++ standard algorithms #116869

Adding Support for Offloading C++ standard algorithms #116869

xevor11 commented Nov 19, 2024 •

edited

Loading

Uh oh!

github-actions bot commented Nov 19, 2024

Uh oh!

llvmbot commented Nov 19, 2024 •

edited

Loading

Uh oh!

llvmbot commented Nov 19, 2024

Uh oh!

xevor11 commented Nov 24, 2024

Uh oh!

jdoerfert commented Nov 25, 2024

Uh oh!

jdoerfert commented Nov 25, 2024

Uh oh!

vidsinghal commented Dec 9, 2024 •

edited

Loading

Uh oh!

Uh oh!

Adding Support for Offloading C++ standard algorithms #116869

Are you sure you want to change the base?

Adding Support for Offloading C++ standard algorithms #116869

Conversation

xevor11 commented Nov 19, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions bot commented Nov 19, 2024

Uh oh!

llvmbot commented Nov 19, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Nov 19, 2024

Uh oh!

xevor11 commented Nov 24, 2024

Uh oh!

jdoerfert commented Nov 25, 2024

Uh oh!

jdoerfert commented Nov 25, 2024

Uh oh!

vidsinghal commented Dec 9, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

xevor11 commented Nov 19, 2024 •

edited

Loading

llvmbot commented Nov 19, 2024 •

edited

Loading

vidsinghal commented Dec 9, 2024 •

edited

Loading