diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h index 8b531887c7318..b7e12adec1e81 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h @@ -27,9 +27,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Index __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED - _PSTL_PRAGMA_SIMD + _PSTL_OMP_MAP_TO(__first, __n); +# pragma omp target enter data map(to : __value) + _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __first[__i] = __value; + _PSTL_OMP_MAP_FROM(__first, __n); + return __first + __n; +} + +/** + * Specialization for std::vector where the base pointer must be extracted to map + * the data to and from the GPU. + */ + +template +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter +__simd_fill_n(std::__wrap_iter __first, _DifferenceType __n, const _Tp& __value) noexcept { + _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED + _PSTL_OMP_MAP_TO(__first, __n); + // For std::vector the base pointer of the data buffer needs to be extracted + std::pointer_traits> PT; + T* data = PT.to_address(__first); +# pragma omp target enter data map(to : __value) + _PSTL_PRAGMA_SIMD(__n) + for (_DifferenceType __i = 0; __i < __n; ++__i) + data[__i] = __value; + _PSTL_OMP_MAP_FROM(__first, __n); return __first + __n; } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h index f6f22fdd8713c..ed336766295cb 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h @@ -26,10 +26,30 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept { - 
_PSTL_PRAGMA_SIMD + _PSTL_OMP_MAP_TO(__first, __n); + _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first[__i]); + _PSTL_OMP_MAP_FROM(__first, __n); + return __first + __n; +} +/** + * Specialization for std::vector where the base pointer must be extracted to map + * the data to and from the GPU. + */ + +template +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter +__simd_walk_1(std::__wrap_iter __first, _DifferenceType __n, _Function __f) noexcept { + _PSTL_OMP_MAP_TO(__first, __n); + // For std::vector the base pointer of the data buffer needs to be extracted + std::pointer_traits> PT; + T* data = PT.to_address(__first); + _PSTL_PRAGMA_SIMD(__n) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(data[__i]); + _PSTL_OMP_MAP_FROM(__first, __n); return __first + __n; } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index 0259d8a84bb3f..e6cd70b5a420b 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -30,9 +30,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Iterator2 __simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept { - _PSTL_PRAGMA_SIMD + _PSTL_OMP_MAP_TO(__first1, __n); + _PSTL_OMP_MAP_ALLOC(__first2, __n); + _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first1[__i], __first2[__i]); + _PSTL_OMP_MAP_FROM(__first2, __n); + return __first2 + __n; +} + +/** + * Specialization for std::vector where the base pointer must be extracted to map + * the data to and from the GPU. 
+ */ + +template +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter __simd_walk_2( + std::__wrap_iter __first1, _DifferenceType __n, std::__wrap_iter __first2, _Function __f) noexcept { + _PSTL_OMP_MAP_TO(__first1, __n); + _PSTL_OMP_MAP_ALLOC(__first2, __n); + std::pointer_traits> PT1; + std::pointer_traits> PT2; + T1* __data1 = PT1.to_address(__first1); + T2* __data2 = PT2.to_address(__first2); + _PSTL_PRAGMA_SIMD(__n) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__data1[__i], __data2[__i]); + _PSTL_OMP_MAP_FROM(__first2, __n); return __first2 + __n; } @@ -72,9 +96,43 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform( template _LIBCPP_HIDE_FROM_ABI _Iterator3 __simd_walk_3( _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept { - _PSTL_PRAGMA_SIMD + _PSTL_OMP_MAP_TO(__first1, __n); + _PSTL_OMP_MAP_TO(__first2, __n); + _PSTL_OMP_MAP_TO(__first3, __n); + _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first1[__i], __first2[__i], __first3[__i]); + _PSTL_OMP_MAP_FROM(__first2, __n); + _PSTL_OMP_MAP_FROM(__first3, __n); + return __first3 + __n; +} + +/** + * Specialization for std::vector where the base pointer must be extracted to map + * the data to and from the GPU. 
+ */ + +template +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter +__simd_walk_3(std::__wrap_iter __first1, + _DifferenceType __n, + std::__wrap_iter __first2, + std::__wrap_iter __first3, + _Function __f) noexcept { + _PSTL_OMP_MAP_TO(__first1, __n); + _PSTL_OMP_MAP_TO(__first2, __n); + _PSTL_OMP_MAP_TO(__first3, __n); + std::pointer_traits> PT1; + std::pointer_traits> PT2; + std::pointer_traits> PT3; + T1* __data1 = PT1.to_address(__first1); + T2* __data2 = PT2.to_address(__first2); + T3* __data3 = PT3.to_address(__first3); + _PSTL_PRAGMA_SIMD(__n) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__data1[__i], __data2[__i], __data3[__i]); + _PSTL_OMP_MAP_FROM(__first2, __n); + _PSTL_OMP_MAP_FROM(__first3, __n); return __first3 + __n; } template (__lane_buffer); // initializer - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_Size __i = 0; __i < __block_size; ++__i) { ::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i))); } @@ -65,13 +65,13 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un _Size __i = 2 * __block_size; const _Size __last_iteration = __block_size * (__n / __block_size); for (; __i < __last_iteration; __i += __block_size) { - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_Size __j = 0; __j < __block_size; ++__j) { __lane[__j] = __binary_op(std::move(__lane[__j]), __f(__i + __j)); } } // remainder - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_Size __j = 0; __j < __n - __last_iteration; ++__j) { __lane[__j] = __binary_op(std::move(__lane[__j]), __f(__last_iteration + __j)); } @@ -80,7 +80,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un __init = __binary_op(std::move(__init), std::move(__lane[__j])); } // destroyer - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_Size __j = 0; __j < __block_size; ++__j) { __lane[__j].~_Tp(); } diff --git a/libcxx/include/__config b/libcxx/include/__config index bf2564e2732ba..f4e5c511cdf67 100644 --- a/libcxx/include/__config +++ 
b/libcxx/include/__config @@ -1414,8 +1414,140 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c // Enable SIMD for compilers that support OpenMP 4.0 # if (defined(_OPENMP) && _OPENMP >= 201307) +# ifdef _LIBCPP_ENABLE_OPENMP_OFFLOAD +# ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES +# define _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES 32768 +# endif +#include +#include <__iterator/wrap_iter.h> +# define _PSTL_PRAGMA_DATA_MAP_TO(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(to:NAME[:LEN])) +# define _PSTL_PRAGMA_DATA_MAP_ALLOC(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(alloc:NAME[:LEN])) +# define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target exit data map(from:NAME[:LEN])) + +template +bool constexpr OMPIsOffloadable(N size) +{ + return size >= _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES; +} + +bool constexpr OMPIsOffloadable(void) +{ + return false; +} + +template +void inline OMPMapToIf(T data,N length,int device = omp_get_default_device()) +{ + // If the data is already present on the device, there is no need to + // transfer the data again. +#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED + if (omp_target_is_present(data,device)){ + return; + } +#endif + // If it is a small amount of data it does not make sense to offload to a + // device + if (!OMPIsOffloadable(length)){ + return; + } + _PSTL_PRAGMA_DATA_MAP_TO(data,length); +} + +template +void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device()) +{ + // If the data is already present on the device, there is no need to + // transfer the data again. 
+#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED + if (omp_target_is_present(data,device)){ + return; + } +#endif + // If it is a small amount of data it does not make sense to offload to a + // device + if (!OMPIsOffloadable(length)){ + return; + } + _PSTL_PRAGMA_DATA_MAP_ALLOC(data,length); +} + +template +void inline OMPMapFromIf(T data,N length,int device = omp_get_default_device()) +{ + // If the data is already present on the device, there is no need to + // transfer the data again. +#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED + if (omp_target_is_present(data,device)){ + return; + } +#endif + // If it is a small amount of data it does not make sense to offload to a + // device + if (!OMPIsOffloadable(length)){ + return; + } + _PSTL_PRAGMA_DATA_MAP_FROM(data,length); +} + +template +void inline OMPMapTo(T data,N length) { + OMPMapToIf(data,length); +} + +/** + * Specialization for std::vector +*/ + +template +void inline OMPMapTo(std::__wrap_iter w,N length) { + std::pointer_traits> PT; + T* data = PT.to_address(w); + OMPMapToIf(data,length); +} + +template +void inline OMPMapAlloc(T data,N length) { + OMPMapAllocIf(data,length); +} + +/** + * Specialization for std::vector +*/ + +template +void inline OMPMapAlloc(std::__wrap_iter w,N length) { + std::pointer_traits> PT; + T* data = PT.to_address(w); + OMPMapAllocIf(data,length); +} + +template +void inline OMPMapFrom(T data,N length) { + OMPMapFromIf(data,length); +} + +/** + * Specialization for std::vector +*/ + +template +void inline OMPMapFrom(std::__wrap_iter w,N length) { + std::pointer_traits> PT; + T* data = PT.to_address(w); + OMPMapFromIf(data,length); +} +# define _PSTL_OMP_MAP_TO(DATA,LEN) OMPMapTo(DATA,LEN) +# define _PSTL_OMP_MAP_ALLOC(DATA,LEN) OMPMapAlloc(DATA,LEN) +# define _PSTL_OMP_MAP_FROM(DATA,LEN) OMPMapFrom(DATA,LEN) +# define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp target teams distribute parallel for simd if(OMPIsOffloadable(__VA_ARGS__))) +# else +# define _PSTL_PRAGMA_SIMD(...) 
_PSTL_PRAGMA(omp simd) +# define _PSTL_OMP_MAP_TO(DATA,LEN) +# define _PSTL_OMP_MAP_ALLOC(DATA,LEN) +# define _PSTL_OMP_MAP_FROM(DATA,LEN) +# endif + # define _PSTL_UDR_PRESENT -# define _PSTL_PRAGMA_SIMD _PSTL_PRAGMA(omp simd) # define _PSTL_PRAGMA_DECLARE_SIMD _PSTL_PRAGMA(omp declare simd) # define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _PSTL_PRAGMA(omp simd reduction(PRM)) # define _PSTL_PRAGMA_SIMD_SCAN(PRM) _PSTL_PRAGMA(omp simd reduction(inscan, PRM)) @@ -1434,7 +1566,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # elif defined(_LIBCPP_COMPILER_CLANG_BASED) -# define _PSTL_PRAGMA_SIMD _Pragma("clang loop vectorize(enable) interleave(enable)") +# define _PSTL_PRAGMA_SIMD(...) _Pragma("clang loop vectorize(enable) interleave(enable)") # define _PSTL_PRAGMA_DECLARE_SIMD # define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)") # define _PSTL_PRAGMA_SIMD_SCAN(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)") @@ -1444,7 +1576,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # else // (defined(_OPENMP) && _OPENMP >= 201307) -# define _PSTL_PRAGMA_SIMD +# define _PSTL_PRAGMA_SIMD(...) # define _PSTL_PRAGMA_DECLARE_SIMD # define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) # define _PSTL_PRAGMA_SIMD_SCAN(PRM) @@ -1454,6 +1586,11 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # endif // (defined(_OPENMP) && _OPENMP >= 201307) +# ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD +# define _PSTL_OMP_MAP_TO(...) +# define _PSTL_OMP_MAP_FROM(...) +# endif + # define _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED #endif // __cplusplus