[RFC] Offloading C++ standard parallel algorithms to GPUs using OpenMP #66465

Closed

Changes from all commits

26 changes: 25 additions & 1 deletion libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h

@@ -27,9 +27,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Index, class _DifferenceType, class _Tp>
_LIBCPP_HIDE_FROM_ABI _Index __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
_PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
- _PSTL_PRAGMA_SIMD
+ _PSTL_OMP_MAP_TO(__first, __n);
+ # pragma omp target enter data map(to : __value)
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__first[__i] = __value;
+ _PSTL_OMP_MAP_FROM(__first, __n);
return __first + __n;
}

+ /**
+  * Overload for std::vector, where the base pointer must be extracted so the
+  * data can be mapped to and from the GPU.
+  */
+ template <typename T, class _DifferenceType, class _Tp>
+ _LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*>
+ __simd_fill_n(std::__wrap_iter<T*> __first, _DifferenceType __n, const _Tp& __value) noexcept {
+   _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
+   _PSTL_OMP_MAP_TO(__first, __n);
+   // For std::vector, the base pointer of the data buffer needs to be extracted.
+   T* __data = std::pointer_traits<std::__wrap_iter<T*>>::to_address(__first);
+ # pragma omp target enter data map(to : __value)
+   _PSTL_PRAGMA_SIMD(__n)
+   for (_DifferenceType __i = 0; __i < __n; ++__i)
+     __data[__i] = __value;
+   _PSTL_OMP_MAP_FROM(__first, __n);
+   return __first + __n;
+ }
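
For context (not part of the diff): a minimal caller-side sketch of how this
overload is reached. The build flags are an assumption of this RFC (e.g.
-fopenmp with an offload target plus -D_LIBCPP_ENABLE_OPENMP_OFFLOAD), not a
settled interface.

#include <algorithm>
#include <execution>
#include <vector>

int main() {
  std::vector<float> v(1 << 20); // large enough to pass the offload threshold
  // par_unseq dispatches into the CPU backend's __simd_fill_n; the __wrap_iter
  // overload above unwraps v's base pointer for the map clauses.
  std::fill(std::execution::par_unseq, v.begin(), v.end(), 1.0f);
}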


@@ -26,10 +26,30 @@ _LIBCPP_BEGIN_NAMESPACE_STD

template <class _Iterator, class _DifferenceType, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_OMP_MAP_TO(__first, __n);
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first[__i]);
+ _PSTL_OMP_MAP_FROM(__first, __n);
return __first + __n;
}

+ /**
+  * Overload for std::vector, where the base pointer must be extracted so the
+  * data can be mapped to and from the GPU.
+  */
+ template <typename T, class _DifferenceType, class _Function>
+ _LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*>
+ __simd_walk_1(std::__wrap_iter<T*> __first, _DifferenceType __n, _Function __f) noexcept {
+   _PSTL_OMP_MAP_TO(__first, __n);
+   // For std::vector, the base pointer of the data buffer needs to be extracted.
+   T* __data = std::pointer_traits<std::__wrap_iter<T*>>::to_address(__first);
+   _PSTL_PRAGMA_SIMD(__n)
+   for (_DifferenceType __i = 0; __i < __n; ++__i)
+     __f(__data[__i]);
+   _PSTL_OMP_MAP_FROM(__first, __n);
+   return __first + __n;
+ }
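
Again as an illustrative assumption about the dispatch path: the unary walk is
what an offloaded std::for_each would run through.

#include <algorithm>
#include <execution>
#include <vector>

int main() {
  std::vector<double> v(1 << 20, 1.0);
  // The vector is mapped to the device, updated in place, and mapped back.
  std::for_each(std::execution::par_unseq, v.begin(), v.end(),
                [](double& x) { x = 2.0 * x + 1.0; });
}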


@@ -30,9 +30,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator2
__simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_OMP_MAP_TO(__first1, __n);
+ _PSTL_OMP_MAP_ALLOC(__first2, __n);
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first1[__i], __first2[__i]);
+ _PSTL_OMP_MAP_FROM(__first2, __n);
return __first2 + __n;
}

+ /**
+  * Overload for std::vector, where the base pointers must be extracted so the
+  * data can be mapped to and from the GPU.
+  */
+ template <typename T1, class _DifferenceType, typename T2, class _Function>
+ _LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T2*> __simd_walk_2(
+     std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_iter<T2*> __first2, _Function __f) noexcept {
+   _PSTL_OMP_MAP_TO(__first1, __n);
+   _PSTL_OMP_MAP_ALLOC(__first2, __n);
+   T1* __data1 = std::pointer_traits<std::__wrap_iter<T1*>>::to_address(__first1);
+   T2* __data2 = std::pointer_traits<std::__wrap_iter<T2*>>::to_address(__first2);
+   _PSTL_PRAGMA_SIMD(__n)
+   for (_DifferenceType __i = 0; __i < __n; ++__i)
+     __f(__data1[__i], __data2[__i]);
+   _PSTL_OMP_MAP_FROM(__first2, __n);
+   return __first2 + __n;
+ }
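
A matching caller-side sketch for the two-sequence walk (again an assumption
about the dispatch path): a unary std::transform writes through __first2,
which is why __first2 is mapped alloc before the loop and from afterwards.

#include <algorithm>
#include <execution>
#include <vector>

int main() {
  std::vector<float> in(1 << 20, 3.0f), out(1 << 20);
  // in is copied to the device; out is only allocated there and copied back.
  std::transform(std::execution::par_unseq, in.begin(), in.end(), out.begin(),
                 [](float x) { return x * x; });
}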

@@ -72,9 +96,43 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator3 __simd_walk_3(
_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_OMP_MAP_TO(__first1, __n);
+ _PSTL_OMP_MAP_TO(__first2, __n);
+ _PSTL_OMP_MAP_TO(__first3, __n);
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first1[__i], __first2[__i], __first3[__i]);
+ _PSTL_OMP_MAP_FROM(__first2, __n);
+ _PSTL_OMP_MAP_FROM(__first3, __n);
return __first3 + __n;
}

+ /**
+  * Overload for std::vector, where the base pointers must be extracted so the
+  * data can be mapped to and from the GPU.
+  */
+ template <typename T1, class _DifferenceType, typename T2, typename T3, class _Function>
+ _LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T3*>
+ __simd_walk_3(std::__wrap_iter<T1*> __first1,
+               _DifferenceType __n,
+               std::__wrap_iter<T2*> __first2,
+               std::__wrap_iter<T3*> __first3,
+               _Function __f) noexcept {
+   _PSTL_OMP_MAP_TO(__first1, __n);
+   _PSTL_OMP_MAP_TO(__first2, __n);
+   _PSTL_OMP_MAP_TO(__first3, __n);
+   T1* __data1 = std::pointer_traits<std::__wrap_iter<T1*>>::to_address(__first1);
+   T2* __data2 = std::pointer_traits<std::__wrap_iter<T2*>>::to_address(__first2);
+   T3* __data3 = std::pointer_traits<std::__wrap_iter<T3*>>::to_address(__first3);
+   _PSTL_PRAGMA_SIMD(__n)
+   for (_DifferenceType __i = 0; __i < __n; ++__i)
+     __f(__data1[__i], __data2[__i], __data3[__i]);
+   _PSTL_OMP_MAP_FROM(__first2, __n);
+   _PSTL_OMP_MAP_FROM(__first3, __n);
+   return __first3 + __n;
+ }
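
And for the three-sequence walk, which a binary std::transform would reach
(same caveat: an illustrative assumption, not part of the diff):

#include <algorithm>
#include <execution>
#include <vector>

int main() {
  std::vector<int> a(1 << 20, 1), b(1 << 20, 2), c(1 << 20);
  std::transform(std::execution::par_unseq, a.begin(), a.end(), b.begin(),
                 c.begin(), [](int x, int y) { return x + y; });
}
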
template <class _ExecutionPolicy,

@@ -57,21 +57,21 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
_Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer);

// initializer
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __i = 0; __i < __block_size; ++__i) {
::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i)));
}
// main loop
_Size __i = 2 * __block_size;
const _Size __last_iteration = __block_size * (__n / __block_size);
for (; __i < __last_iteration; __i += __block_size) {
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __block_size; ++__j) {
__lane[__j] = __binary_op(std::move(__lane[__j]), __f(__i + __j));
}
}
// remainder
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __n - __last_iteration; ++__j) {
__lane[__j] = __binary_op(std::move(__lane[__j]), __f(__last_iteration + __j));
}
@@ -80,7 +80,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
__init = __binary_op(std::move(__init), std::move(__lane[__j]));
}
// destroyer
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __block_size; ++__j) {
__lane[__j].~_Tp();
}
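
A worked trace of the lane scheme above (this PR only adds the empty argument
lists; the algorithm itself is unchanged), assuming __block_size = 4, __n = 10,
__f(i) = i, and + as __binary_op:

// initializer: __lane[j] = f(j) + f(4 + j)        -> lanes {4, 6, 8, 10}
// main loop:   __last_iteration = 4 * (10 / 4) = 8; __i starts at 8,
//              so the body never runs
// remainder:   lanes 0 and 1 absorb f(8) and f(9) -> lanes {12, 15, 8, 10}
// fold:        __init + 12 + 15 + 8 + 10 = __init + 45 = __init + (0 + ... + 9)
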
143 changes: 140 additions & 3 deletions libcxx/include/__config

@@ -1414,8 +1414,140 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
// Enable SIMD for compilers that support OpenMP 4.0
# if (defined(_OPENMP) && _OPENMP >= 201307)

+ # ifdef _LIBCPP_ENABLE_OPENMP_OFFLOAD
+ # ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES
+ # define _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES 32768
+ # endif
+ # include <omp.h>
+ # include <__iterator/wrap_iter.h>
+ # define _PSTL_PRAGMA_DATA_MAP_TO(NAME, LEN) _PSTL_PRAGMA(omp target enter data map(to : NAME[:LEN]))
+ # define _PSTL_PRAGMA_DATA_MAP_ALLOC(NAME, LEN) _PSTL_PRAGMA(omp target enter data map(alloc : NAME[:LEN]))
+ # define _PSTL_PRAGMA_DATA_MAP_FROM(NAME, LEN) _PSTL_PRAGMA(omp target exit data map(from : NAME[:LEN]))
+
+ // A range is only worth offloading if it is at least
+ // _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES long; without a known length,
+ // never offload.
+ template <typename N>
+ constexpr bool OMPIsOffloadable(N size) {
+   return size >= _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES;
+ }
+
+ constexpr bool OMPIsOffloadable() { return false; }
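
// Illustrative evaluations (not part of the patch), assuming the default
// _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES of 32768; note the comparison is
// against whatever count the call sites pass as the length:
//
//   static_assert(OMPIsOffloadable(100000), "large ranges may be offloaded");
//   static_assert(!OMPIsOffloadable(100), "small ranges stay on the host");
//   static_assert(!OMPIsOffloadable(), "no length available: never offload");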

+ template <typename T, typename N>
+ inline void OMPMapToIf(T data, N length, int device = omp_get_default_device()) {
+   // If the data is already present on the device, there is no need to
+   // transfer it again.
+ # ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED
+   if (omp_target_is_present(data, device)) {
+     return;
+   }
+ # endif
+   // If it is a small amount of data, it does not make sense to offload it to
+   // a device.
+   if (!OMPIsOffloadable(length)) {
+     return;
+   }
+   _PSTL_PRAGMA_DATA_MAP_TO(data, length);
+ }
+
+ template <typename T, typename N>
+ inline void OMPMapAllocIf(T data, N length, int device = omp_get_default_device()) {
+   // If the data is already present on the device, there is no need to
+   // allocate it again.
+ # ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED
+   if (omp_target_is_present(data, device)) {
+     return;
+   }
+ # endif
+   // If it is a small amount of data, it does not make sense to offload it to
+   // a device.
+   if (!OMPIsOffloadable(length)) {
+     return;
+   }
+   _PSTL_PRAGMA_DATA_MAP_ALLOC(data, length);
+ }
+
+ template <typename T, typename N>
+ inline void OMPMapFromIf(T data, N length, int device = omp_get_default_device()) {
+   // If the caller keeps the data mapped on the device, there is no need to
+   // transfer it back here.
+ # ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED
+   if (omp_target_is_present(data, device)) {
+     return;
+   }
+ # endif
+   // If it is a small amount of data, it does not make sense to offload it to
+   // a device.
+   if (!OMPIsOffloadable(length)) {
+     return;
+   }
+   _PSTL_PRAGMA_DATA_MAP_FROM(data, length);
+ }
+
+ template <typename T, typename N>
+ inline void OMPMapTo(T data, N length) {
+   OMPMapToIf(data, length);
+ }
+
+ /**
+  * Overload for std::vector, whose __wrap_iter must be unwrapped to the base
+  * pointer before the data can be mapped to the device.
+  */
+ template <typename T, typename N>
+ inline void OMPMapTo(std::__wrap_iter<T*> w, N length) {
+   T* data = std::pointer_traits<std::__wrap_iter<T*>>::to_address(w);
+   OMPMapToIf(data, length);
+ }
+
+ template <typename T, typename N>
+ inline void OMPMapAlloc(T data, N length) {
+   OMPMapAllocIf(data, length);
+ }
+
+ /**
+  * Overload for std::vector, whose __wrap_iter must be unwrapped to the base
+  * pointer before the data can be allocated on the device.
+  */
+ template <typename T, typename N>
+ inline void OMPMapAlloc(std::__wrap_iter<T*> w, N length) {
+   T* data = std::pointer_traits<std::__wrap_iter<T*>>::to_address(w);
+   OMPMapAllocIf(data, length);
+ }
+
+ template <typename T, typename N>
+ inline void OMPMapFrom(T data, N length) {
+   OMPMapFromIf(data, length);
+ }
+
+ /**
+  * Overload for std::vector, whose __wrap_iter must be unwrapped to the base
+  * pointer before the data can be mapped back from the device.
+  */
+ template <typename T, typename N>
+ inline void OMPMapFrom(std::__wrap_iter<T*> w, N length) {
+   T* data = std::pointer_traits<std::__wrap_iter<T*>>::to_address(w);
+   OMPMapFromIf(data, length);
+ }
+
+ # define _PSTL_OMP_MAP_TO(DATA, LEN) OMPMapTo(DATA, LEN)
+ # define _PSTL_OMP_MAP_ALLOC(DATA, LEN) OMPMapAlloc(DATA, LEN)
+ # define _PSTL_OMP_MAP_FROM(DATA, LEN) OMPMapFrom(DATA, LEN)
+ # define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp target teams distribute parallel for simd if(OMPIsOffloadable(__VA_ARGS__)))
+ # else
+ # define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp simd)
+ # define _PSTL_OMP_MAP_TO(DATA, LEN)
+ # define _PSTL_OMP_MAP_ALLOC(DATA, LEN)
+ # define _PSTL_OMP_MAP_FROM(DATA, LEN)
+ # endif

# define _PSTL_UDR_PRESENT
- # define _PSTL_PRAGMA_SIMD _PSTL_PRAGMA(omp simd)
# define _PSTL_PRAGMA_DECLARE_SIMD _PSTL_PRAGMA(omp declare simd)
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _PSTL_PRAGMA(omp simd reduction(PRM))
# define _PSTL_PRAGMA_SIMD_SCAN(PRM) _PSTL_PRAGMA(omp simd reduction(inscan, PRM))
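
To make the generated control flow concrete, a hand-expanded sketch of a
backend loop under these macros (an illustration only, assuming offload is
enabled, the length check passes, and OMPIsOffloadable from above is in scope;
the helper name `twice` is hypothetical). When the checks fail, the map
helpers are no-ops and the if clause keeps execution on the host.

void twice(float* p, long n) {
#pragma omp target enter data map(to : p[:n])
#pragma omp target teams distribute parallel for simd if(OMPIsOffloadable(n))
  for (long i = 0; i < n; ++i)
    p[i] *= 2.0f;
#pragma omp target exit data map(from : p[:n])
}
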
@@ -1434,7 +1566,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c

# elif defined(_LIBCPP_COMPILER_CLANG_BASED)

- # define _PSTL_PRAGMA_SIMD _Pragma("clang loop vectorize(enable) interleave(enable)")
+ # define _PSTL_PRAGMA_SIMD(...) _Pragma("clang loop vectorize(enable) interleave(enable)")
# define _PSTL_PRAGMA_DECLARE_SIMD
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)")
# define _PSTL_PRAGMA_SIMD_SCAN(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)")
@@ -1444,7 +1576,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c

# else // (defined(_OPENMP) && _OPENMP >= 201307)

- # define _PSTL_PRAGMA_SIMD
+ # define _PSTL_PRAGMA_SIMD(...)
# define _PSTL_PRAGMA_DECLARE_SIMD
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM)
# define _PSTL_PRAGMA_SIMD_SCAN(PRM)
@@ -1454,6 +1586,11 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c

# endif // (defined(_OPENMP) && _OPENMP >= 201307)

+ // No-op data-mapping macros for configurations without offloading. Guarded
+ // on the macro itself so the no-op definitions in the OpenMP branch above
+ // are not redefined; _PSTL_OMP_MAP_ALLOC must be covered here as well, since
+ // the transform backend uses it.
+ # ifndef _PSTL_OMP_MAP_TO
+ # define _PSTL_OMP_MAP_TO(...)
+ # define _PSTL_OMP_MAP_ALLOC(...)
+ # define _PSTL_OMP_MAP_FROM(...)
+ # endif

# define _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED

#endif // __cplusplus