diff --git a/ChangeLog b/ChangeLog index f25fb8cc..99994e51 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,21 @@ +2024-02-28 Dirk Eddelbuettel + + * DESCRIPTION (Version, Date): Release 0.3.4.0.0 + 2024-02-12 Tomas Kalibera * inst/include/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h: Support clang on Windows by including 'sched.h' header +2024-01-17 Yixuan Qiu + + * DESCRIPTION (Version, Date): Release candidate 0.3.3.99.0 + + * inst/include/Eigen: Upgraded to Eigen 3.4.0 + * inst/include/unsupported/Eigen: Idem + * patches/eigen-3.4.0.diff: Carried local CRAN patches forward + * patches/howToDiff.md: Idem + 2023-11-01 Dirk Eddelbuettel * DESCRIPTION (Version, Date): CRAN Release 0.3.3.9.4 diff --git a/DESCRIPTION b/DESCRIPTION index 3214b73a..31296800 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppEigen Type: Package Title: 'Rcpp' Integration for the 'Eigen' Templated Linear Algebra Library -Version: 0.3.3.9.4 -Date: 2023-11-01 +Version: 0.3.4.0.0 +Date: 2024-02-28 Author: Douglas Bates, Dirk Eddelbuettel, Romain Francois, and Yixuan Qiu; the authors of Eigen for the included version of Eigen Maintainer: Dirk Eddelbuettel diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index c95a4b34..a1497930 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -3,6 +3,16 @@ \newcommand{\ghpr}{\href{https://github.com/RcppCore/RcppEigen/pull/#1}{##1}} \newcommand{\ghit}{\href{https://github.com/RcppCore/RcppEigen/issues/#1}{##1}} +\section{Changes in RcppEigen version 0.3.4.0.0 (2024-02-28)}{ + \itemize{ + \item The Eigen version has been upgraded to release 3.4.0 (Yixuan) + \item Extensive reverse-dependency checks ensure only three out of over + 400 packages at CRAN are affected; PRs and patches helped other packages + \item The long-running branch also contains substantial contributions + from Mikael Jagan (for the lme4 interface) and Andrew Johnson (revdep PRs) + } +} + \section{Changes in RcppEigen version 0.3.3.9.4 (2023-11-01)}{ \itemize{ \item The 
CITATION file has been updated for the new \code{bibentry} diff --git a/inst/include/Eigen/Cholesky b/inst/include/Eigen/Cholesky index 1332b540..a318ceb7 100644 --- a/inst/include/Eigen/Cholesky +++ b/inst/include/Eigen/Cholesky @@ -43,4 +43,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CHOLESKY_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/Eigen/CholmodSupport b/inst/include/Eigen/CholmodSupport index fea39040..22b897c6 100644 --- a/inst/include/Eigen/CholmodSupport +++ b/inst/include/Eigen/CholmodSupport @@ -41,3 +41,4 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CHOLMODSUPPORT_MODULE_H + diff --git a/inst/include/Eigen/Core b/inst/include/Eigen/Core index ac7c5b30..5921e15f 100644 --- a/inst/include/Eigen/Core +++ b/inst/include/Eigen/Core @@ -11,251 +11,55 @@ #ifndef EIGEN_CORE_H #define EIGEN_CORE_H -// first thing Eigen does: stop the compiler from committing suicide +// first thing Eigen does: stop the compiler from reporting useless warnings. #include "src/Core/util/DisableStupidWarnings.h" -#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) - #define EIGEN_CUDACC __CUDACC__ -#endif - -#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) - #define EIGEN_CUDA_ARCH __CUDA_ARCH__ -#endif - -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) -#elif defined(__CUDACC_VER__) -#define EIGEN_CUDACC_VER __CUDACC_VER__ -#else -#define EIGEN_CUDACC_VER 0 -#endif - -// Handle NVCC/CUDA/SYCL -#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) - // Do not try asserts on CUDA and SYCL! 
- #ifndef EIGEN_NO_DEBUG - #define EIGEN_NO_DEBUG - #endif - - #ifdef EIGEN_INTERNAL_DEBUGGING - #undef EIGEN_INTERNAL_DEBUGGING - #endif - - #ifdef EIGEN_EXCEPTIONS - #undef EIGEN_EXCEPTIONS - #endif - - // All functions callable from CUDA code must be qualified with __device__ - #ifdef __CUDACC__ - // Do not try to vectorize on CUDA and SYCL! - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif - - #define EIGEN_DEVICE_FUNC __host__ __device__ - // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro - // works properly on the device side - #include - #else - #define EIGEN_DEVICE_FUNC - #endif - -#else - #define EIGEN_DEVICE_FUNC +// then include this file where all our macros are defined. It's really important to do it first because +// it's where we do all the compiler/OS/arch detections and define most defaults. +#include "src/Core/util/Macros.h" -#endif +// This detects SSE/AVX/NEON/etc. and configure alignment settings +#include "src/Core/util/ConfigureVectorization.h" -// When compiling CUDA device code with NVCC, pull in math functions from the -// global namespace. In host mode, and when device doee with clang, use the -// std versions. -#if defined(__CUDA_ARCH__) && defined(__NVCC__) - #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; -#else - #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; +// We need cuda_runtime.h/hip_runtime.h to ensure that +// the EIGEN_USING_STD macro works properly on the device side +#if defined(EIGEN_CUDACC) + #include +#elif defined(EIGEN_HIPCC) + #include #endif -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) - #define EIGEN_EXCEPTIONS -#endif #ifdef EIGEN_EXCEPTIONS #include #endif -// then include this file where all our macros are defined. 
It's really important to do it first because -// it's where we do all the alignment settings (platform detection and honoring the user's will if he -// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization. -#include "src/Core/util/Macros.h" - // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3) // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details. -#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) +#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) && EIGEN_GNUC_AT_MOST(5,5) #pragma GCC optimize ("-fno-ipa-cp-clone") #endif +// Prevent ICC from specializing std::complex operators that silently fail +// on device. This allows us to use our own device-compatible specializations +// instead. +#if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \ + && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_) +#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1 +#endif #include // this include file manages BLAS and MKL related macros // and inclusion of their respective header files #include "src/Core/util/MKL_support.h" -// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into -// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks -#if EIGEN_MAX_ALIGN_BYTES==0 - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif -#endif -#if EIGEN_COMP_MSVC - #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled - #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later - // Remember that usage of defined() in a #define is undefined by the standard. - // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. 
- #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 - #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER - #endif - #endif -#else - // Remember that usage of defined() in a #define is undefined by the standard - #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) - #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC - #endif -#endif - -#ifndef EIGEN_DONT_VECTORIZE - - #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) - - // Defines symbols for compile-time detection of which instructions are - // used. - // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_SSE - #define EIGEN_VECTORIZE_SSE2 - - // Detect sse3/ssse3/sse4: - // gcc and icc defines __SSE3__, ... - // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you - // want to force the use of those instructions with msvc. - #ifdef __SSE3__ - #define EIGEN_VECTORIZE_SSE3 - #endif - #ifdef __SSSE3__ - #define EIGEN_VECTORIZE_SSSE3 - #endif - #ifdef __SSE4_1__ - #define EIGEN_VECTORIZE_SSE4_1 - #endif - #ifdef __SSE4_2__ - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX__ - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX2__ - #define EIGEN_VECTORIZE_AVX2 - #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif - #if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512) - #define EIGEN_VECTORIZE_AVX512 - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_FMA - #ifdef __AVX512DQ__ - #define EIGEN_VECTORIZE_AVX512DQ - #endif - #ifdef __AVX512ER__ - #define EIGEN_VECTORIZE_AVX512ER - #endif - #endif - - // include files - - // This extern "C" works around a MINGW-w64 compilation issue - // 
https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 - // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). - // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations - // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; - // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. - // notice that since these are C headers, the extern "C" is theoretically needed anyways. - extern "C" { - // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. - // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 - #include - #else - #include - #include - #include - #ifdef EIGEN_VECTORIZE_SSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_1 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_2 - #include - #endif - #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) - #include - #endif - #endif - } // end extern "C" - #elif defined __VSX__ - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_VSX - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - #elif defined __ALTIVEC__ - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ALTIVEC - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - #elif (defined __ARM_NEON) || (defined __ARM_NEON__) - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_NEON - #include - #elif (defined __s390x__ && defined __VEC__) - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ZVECTOR - #include - #endif 
-#endif - -#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) - // We can use the optimized fp16 to float and float to fp16 conversion routines - #define EIGEN_HAS_FP16_C -#endif - -#if defined __CUDACC__ - #define EIGEN_VECTORIZE_CUDA - #include - #if EIGEN_CUDACC_VER >= 70500 - #define EIGEN_HAS_CUDA_FP16 - #endif +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + #define EIGEN_HAS_GPU_FP16 #endif -#if defined EIGEN_HAS_CUDA_FP16 - #include - #include +#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16) + #define EIGEN_HAS_GPU_BF16 #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) @@ -290,6 +94,10 @@ // for min/max: #include +#if EIGEN_HAS_CXX11 +#include +#endif + // for std::is_nothrow_move_assignable #ifdef EIGEN_INCLUDE_TYPE_TRAITS #include @@ -305,38 +113,25 @@ #include #endif -/** \brief Namespace containing all symbols from the %Eigen library. */ -namespace Eigen { - -inline static const char *SimdInstructionSetsInUse(void) { -#if defined(EIGEN_VECTORIZE_AVX512) - return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_AVX) - return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_2) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_1) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; -#elif defined(EIGEN_VECTORIZE_SSSE3) - return "SSE, SSE2, SSE3, SSSE3"; -#elif defined(EIGEN_VECTORIZE_SSE3) - return "SSE, SSE2, SSE3"; -#elif defined(EIGEN_VECTORIZE_SSE2) - return "SSE, SSE2"; -#elif defined(EIGEN_VECTORIZE_ALTIVEC) - return "AltiVec"; -#elif defined(EIGEN_VECTORIZE_VSX) - return "VSX"; -#elif defined(EIGEN_VECTORIZE_NEON) - return "ARM NEON"; -#elif defined(EIGEN_VECTORIZE_ZVECTOR) - return "S390X ZVECTOR"; -#else - return "None"; +#if defined(EIGEN_USE_SYCL) + #undef min + #undef max + #undef isnan + #undef isinf + #undef isfinite + #include + #include + #include + #include + #include + #ifndef 
EIGEN_SYCL_LOCAL_THREAD_DIM0 + #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16 + #endif + #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1 + #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16 + #endif #endif -} -} // end namespace Eigen #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT // This will generate an error message: @@ -345,7 +140,7 @@ inline static const char *SimdInstructionSetsInUse(void) { namespace Eigen { -// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to +// we use size_t frequently and we'll never remember to prepend it with std:: every time just to // ensure QNX/QCC support using std::size_t; // gcc 4.6.0 wants std:: for ptrdiff_t @@ -369,60 +164,90 @@ using std::ptrdiff_t; #include "src/Core/util/StaticAssert.h" #include "src/Core/util/XprHelper.h" #include "src/Core/util/Memory.h" +#include "src/Core/util/IntegralConstant.h" +#include "src/Core/util/SymbolicIndex.h" #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" #include "src/Core/arch/Default/ConjHelper.h" +// Generic half float support +#include "src/Core/arch/Default/Half.h" +#include "src/Core/arch/Default/BFloat16.h" +#include "src/Core/arch/Default/TypeCasting.h" +#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" - #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/SSE/TypeCasting.h" + #include "src/Core/arch/SSE/Complex.h" #include "src/Core/arch/AVX/PacketMath.h" - #include "src/Core/arch/AVX/MathFunctions.h" + #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/PacketMath.h" + #include 
"src/Core/arch/AVX512/TypeCasting.h" + #include "src/Core/arch/AVX512/Complex.h" + #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/MathFunctions.h" #elif defined EIGEN_VECTORIZE_AVX // Use AVX for floats and doubles, SSE for integers #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/PacketMath.h" - #include "src/Core/arch/AVX/MathFunctions.h" - #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX/TypeCasting.h" - #include "src/Core/arch/SSE/TypeCasting.h" + #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/AVX/MathFunctions.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/SSE/TypeCasting.h" #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/PacketMath.h" #include "src/Core/arch/AltiVec/MathFunctions.h" #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/TypeCasting.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" +#elif defined EIGEN_VECTORIZE_SVE + #include "src/Core/arch/SVE/PacketMath.h" + #include "src/Core/arch/SVE/TypeCasting.h" + #include "src/Core/arch/SVE/MathFunctions.h" #elif defined EIGEN_VECTORIZE_ZVECTOR #include "src/Core/arch/ZVector/PacketMath.h" #include "src/Core/arch/ZVector/MathFunctions.h" #include "src/Core/arch/ZVector/Complex.h" +#elif defined EIGEN_VECTORIZE_MSA + #include "src/Core/arch/MSA/PacketMath.h" + #include "src/Core/arch/MSA/MathFunctions.h" + #include 
"src/Core/arch/MSA/Complex.h" #endif -// Half float support -#include "src/Core/arch/CUDA/Half.h" -#include "src/Core/arch/CUDA/PacketMathHalf.h" -#include "src/Core/arch/CUDA/TypeCasting.h" +#if defined EIGEN_VECTORIZE_GPU + #include "src/Core/arch/GPU/PacketMath.h" + #include "src/Core/arch/GPU/MathFunctions.h" + #include "src/Core/arch/GPU/TypeCasting.h" +#endif -#if defined EIGEN_VECTORIZE_CUDA - #include "src/Core/arch/CUDA/PacketMath.h" - #include "src/Core/arch/CUDA/MathFunctions.h" +#if defined(EIGEN_USE_SYCL) + #include "src/Core/arch/SYCL/SyclMemoryModel.h" + #include "src/Core/arch/SYCL/InteropHeaders.h" +#if !defined(EIGEN_DONT_VECTORIZE_SYCL) + #include "src/Core/arch/SYCL/PacketMath.h" + #include "src/Core/arch/SYCL/MathFunctions.h" + #include "src/Core/arch/SYCL/TypeCasting.h" +#endif #endif #include "src/Core/arch/Default/Settings.h" +// This file provides generic implementations valid for scalar as well +#include "src/Core/arch/Default/GenericPacketMathFunctions.h" #include "src/Core/functors/TernaryFunctors.h" #include "src/Core/functors/BinaryFunctors.h" @@ -433,9 +258,16 @@ using std::ptrdiff_t; // Specialized functors to enable the processing of complex numbers // on CUDA devices +#ifdef EIGEN_CUDACC #include "src/Core/arch/CUDA/Complex.h" +#endif -#include "src/Core/IO.h" +#include "src/Core/util/IndexedViewHelper.h" +#include "src/Core/util/ReshapedHelper.h" +#include "src/Core/ArithmeticSequence.h" +#ifndef EIGEN_NO_IO + #include "src/Core/IO.h" +#endif #include "src/Core/DenseCoeffsBase.h" #include "src/Core/DenseBase.h" #include "src/Core/MatrixBase.h" @@ -476,6 +308,8 @@ using std::ptrdiff_t; #include "src/Core/Ref.h" #include "src/Core/Block.h" #include "src/Core/VectorBlock.h" +#include "src/Core/IndexedView.h" +#include "src/Core/Reshaped.h" #include "src/Core/Transpose.h" #include "src/Core/DiagonalMatrix.h" #include "src/Core/Diagonal.h" @@ -512,13 +346,21 @@ using std::ptrdiff_t; #include "src/Core/CoreIterators.h" #include 
"src/Core/ConditionEstimator.h" +#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) + #include "src/Core/arch/AltiVec/MatrixProduct.h" +#elif defined EIGEN_VECTORIZE_NEON + #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" +#endif + #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" +#include "src/Core/PartialReduxEvaluator.h" #include "src/Core/Random.h" #include "src/Core/Replicate.h" #include "src/Core/Reverse.h" #include "src/Core/ArrayWrapper.h" +#include "src/Core/StlIterators.h" #ifdef EIGEN_USE_BLAS #include "src/Core/products/GeneralMatrixMatrix_BLAS.h" diff --git a/inst/include/Eigen/Eigenvalues b/inst/include/Eigen/Eigenvalues index 7d6ac787..5467a2e7 100644 --- a/inst/include/Eigen/Eigenvalues +++ b/inst/include/Eigen/Eigenvalues @@ -58,4 +58,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_EIGENVALUES_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/Eigen/Geometry b/inst/include/Eigen/Geometry index da88c03b..bc78110a 100644 --- a/inst/include/Eigen/Geometry +++ b/inst/include/Eigen/Geometry @@ -49,14 +49,11 @@ #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX -#include "src/Geometry/arch/Geometry_SSE.h" +// Use the SSE optimized version whenever possible. 
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON) +#include "src/Geometry/arch/Geometry_SIMD.h" #endif #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_GEOMETRY_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ - diff --git a/inst/include/Eigen/Householder b/inst/include/Eigen/Householder index 89cd81b1..f2fa7996 100644 --- a/inst/include/Eigen/Householder +++ b/inst/include/Eigen/Householder @@ -27,4 +27,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_HOUSEHOLDER_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/Eigen/Jacobi b/inst/include/Eigen/Jacobi index 17c1d785..43edc7a1 100644 --- a/inst/include/Eigen/Jacobi +++ b/inst/include/Eigen/Jacobi @@ -29,5 +29,4 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_JACOBI_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/Eigen/KLUSupport b/inst/include/Eigen/KLUSupport new file mode 100644 index 00000000..b23d9053 --- /dev/null +++ b/inst/include/Eigen/KLUSupport @@ -0,0 +1,41 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_MODULE_H +#define EIGEN_KLUSUPPORT_MODULE_H + +#include + +#include + +extern "C" { +#include +#include + } + +/** \ingroup Support_modules + * \defgroup KLUSupport_Module KLUSupport module + * + * This module provides an interface to the KLU library which is part of the suitesparse package. + * It provides the following factorization class: + * - class KLU: a sparse LU factorization, well-suited for circuit simulation. 
+ * + * \code + * #include + * \endcode + * + * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies. + * The dependencies depend on how klu has been compiled. + * For a cmake based project, you can use our FindKLU.cmake module to help you in this task. + * + */ + +#include "src/KLUSupport/KLUSupport.h" + +#include + +#endif // EIGEN_KLUSUPPORT_MODULE_H diff --git a/inst/include/Eigen/LU b/inst/include/Eigen/LU index 6418a86e..1236ceb0 100644 --- a/inst/include/Eigen/LU +++ b/inst/include/Eigen/LU @@ -38,13 +38,10 @@ #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX - #include "src/LU/arch/Inverse_SSE.h" +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON + #include "src/LU/arch/InverseSize4.h" #endif #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_LU_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/Eigen/OrderingMethods b/inst/include/Eigen/OrderingMethods index d8ea3619..29691a62 100644 --- a/inst/include/Eigen/OrderingMethods +++ b/inst/include/Eigen/OrderingMethods @@ -63,10 +63,7 @@ * \endcode */ -#ifndef EIGEN_MPL2_ONLY #include "src/OrderingMethods/Amd.h" -#endif - #include "src/OrderingMethods/Ordering.h" #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/inst/include/Eigen/PaStiXSupport b/inst/include/Eigen/PaStiXSupport index de3a63b4..234619ac 100644 --- a/inst/include/Eigen/PaStiXSupport +++ b/inst/include/Eigen/PaStiXSupport @@ -36,6 +36,7 @@ extern "C" { * \endcode * * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies. 
+ * This wrapper requires PaStiX version 5.x compiled without MPI support. * The dependencies depend on how PaSTiX has been compiled. * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task. * diff --git a/inst/include/Eigen/PardisoSupport b/inst/include/Eigen/PardisoSupport old mode 100755 new mode 100644 diff --git a/inst/include/Eigen/QR b/inst/include/Eigen/QR index 1be1863a..8465b62c 100644 --- a/inst/include/Eigen/QR +++ b/inst/include/Eigen/QR @@ -48,4 +48,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_QR_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/Eigen/QtAlignedMalloc b/inst/include/Eigen/QtAlignedMalloc index 4f07df02..6fe82374 100644 --- a/inst/include/Eigen/QtAlignedMalloc +++ b/inst/include/Eigen/QtAlignedMalloc @@ -37,4 +37,3 @@ void *qRealloc(void *ptr, std::size_t size) #endif #endif // EIGEN_QTMALLOC_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/Eigen/SVD b/inst/include/Eigen/SVD index 5d0e75f7..34517949 100644 --- a/inst/include/Eigen/SVD +++ b/inst/include/Eigen/SVD @@ -48,4 +48,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_SVD_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/Eigen/Sparse b/inst/include/Eigen/Sparse index 136e681a..a2ef7a66 100644 --- a/inst/include/Eigen/Sparse +++ b/inst/include/Eigen/Sparse @@ -25,9 +25,7 @@ #include "SparseCore" #include "OrderingMethods" -#ifndef EIGEN_MPL2_ONLY #include "SparseCholesky" -#endif #include "SparseLU" #include "SparseQR" #include "IterativeLinearSolvers" diff --git a/inst/include/Eigen/SparseCholesky b/inst/include/Eigen/SparseCholesky index b6a320c4..d2b1f127 100644 --- a/inst/include/Eigen/SparseCholesky +++ b/inst/include/Eigen/SparseCholesky @@ -30,16 +30,8 @@ * \endcode */ -#ifdef EIGEN_MPL2_ONLY -#error The SparseCholesky module has nothing to offer in MPL2 only mode -#endif - #include 
"src/SparseCholesky/SimplicialCholesky.h" - -#ifndef EIGEN_MPL2_ONLY #include "src/SparseCholesky/SimplicialCholesky_impl.h" -#endif - #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_SPARSECHOLESKY_MODULE_H diff --git a/inst/include/Eigen/SparseLU b/inst/include/Eigen/SparseLU index 38b38b53..37c4a5c5 100644 --- a/inst/include/Eigen/SparseLU +++ b/inst/include/Eigen/SparseLU @@ -23,6 +23,8 @@ // Ordering interface #include "OrderingMethods" +#include "src/Core/util/DisableStupidWarnings.h" + #include "src/SparseLU/SparseLU_gemm_kernel.h" #include "src/SparseLU/SparseLU_Structs.h" @@ -43,4 +45,6 @@ #include "src/SparseLU/SparseLU_Utils.h" #include "src/SparseLU/SparseLU.h" +#include "src/Core/util/ReenableStupidWarnings.h" + #endif // EIGEN_SPARSELU_MODULE_H diff --git a/inst/include/Eigen/src/Cholesky/LDLT.h b/inst/include/Eigen/src/Cholesky/LDLT.h index 15ccf24f..1013ca04 100644 --- a/inst/include/Eigen/src/Cholesky/LDLT.h +++ b/inst/include/Eigen/src/Cholesky/LDLT.h @@ -16,6 +16,15 @@ namespace Eigen { namespace internal { + template struct traits > + : traits<_MatrixType> + { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; + }; + template struct LDLT_Traits; // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef @@ -36,7 +45,7 @@ namespace internal { * matrix \f$ A \f$ such that \f$ A = P^TLDL^*P \f$, where P is a permutation matrix, L * is lower triangular with a unit diagonal and D is a diagonal matrix. * - * The decomposition uses pivoting to ensure stability, so that L will have + * The decomposition uses pivoting to ensure stability, so that D will have * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root * on D also stabilizes the computation. * @@ -44,24 +53,23 @@ namespace internal { * decomposition to determine whether a system of equations has a solution. 
* * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template class LDLT + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, UpLo = _UpLo }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix TmpMatrixType; typedef Transpositions TranspositionType; @@ -180,6 +188,7 @@ template class LDLT return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A. * * This function also supports in-place solves using the syntax x = decompositionObject.solve(x) . @@ -191,19 +200,14 @@ template class LDLT * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function - * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular. + * computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular. 
* * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt() */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LDLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LDLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif template bool solveInPlace(MatrixBase &bAndX) const; @@ -242,12 +246,12 @@ template class LDLT */ const LDLT& adjoint() const { return *this; }; - inline Index rows() const { return m_matrix.rows(); } - inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the factorization failed because of a zero pivot. 
*/ ComputationInfo info() const @@ -258,8 +262,10 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -560,14 +566,22 @@ template template void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); + _solve_impl_transposed(rhs, dst); +} + +template +template +void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ // dst = P b dst = m_transpositions * rhs; // dst = L^-1 (P b) - matrixL().solveInPlace(dst); + // dst = L^-*T (P b) + matrixL().template conjugateIf().solveInPlace(dst); - // dst = D^-1 (L^-1 P b) + // dst = D^-* (L^-1 P b) + // dst = D^-1 (L^-*T P b) // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal::RealReturnType vecD(vectorD()); @@ -579,7 +593,6 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. // Using numeric_limits::min() gives us more robustness to denormals. 
RealScalar tolerance = (std::numeric_limits::min)(); - for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) @@ -588,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons dst.row(i).setZero(); } - // dst = L^-T (D^-1 L^-1 P b) - matrixU().solveInPlace(dst); + // dst = L^-* (D^-* L^-1 P b) + // dst = L^-T (D^-1 L^-*T P b) + matrixL().transpose().template conjugateIf().solveInPlace(dst); - // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b + // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b + // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b dst = m_transpositions.transpose() * dst; } #endif diff --git a/inst/include/Eigen/src/Cholesky/LLT.h b/inst/include/Eigen/src/Cholesky/LLT.h index e1624d21..8c9b2b39 100644 --- a/inst/include/Eigen/src/Cholesky/LLT.h +++ b/inst/include/Eigen/src/Cholesky/LLT.h @@ -13,6 +13,16 @@ namespace Eigen { namespace internal{ + +template struct traits > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; + template struct LLT_Traits; } @@ -54,18 +64,17 @@ template struct LLT_Traits; * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ template class LLT + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; enum { PacketSize = internal::packet_traits::size, @@ -100,7 +109,7 @@ template class LLT compute(matrix.derived()); } - /** \brief Constructs a LDLT factorization from a given matrix + /** \brief Constructs a 
LLT factorization from a given matrix * * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when * \c MatrixType is a Eigen::Ref. @@ -129,6 +138,7 @@ template class LLT return Traits::getL(m_matrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A. * * Since this LLT class assumes anyway that the matrix A is invertible, the solution @@ -141,13 +151,8 @@ template class LLT */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif template void solveInPlace(const MatrixBase &bAndX) const; @@ -180,7 +185,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears not to be positive definite. 
*/ ComputationInfo info() const @@ -194,18 +199,20 @@ template class LLT * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: * \code x = decomposition.adjoint().solve(b) \endcode */ - const LLT& adjoint() const { return *this; }; + const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; }; - inline Index rows() const { return m_matrix.rows(); } - inline Index cols() const { return m_matrix.cols(); } + inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } template - LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); + LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -459,7 +466,7 @@ LLT& LLT::compute(const EigenBase */ template template -LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) +LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType); eigen_assert(v.size()==m_matrix.cols()); @@ -477,8 +484,17 @@ template template void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - dst = rhs; - solveInPlace(dst); + _solve_impl_transposed(rhs, dst); +} + +template +template +void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + dst = rhs; + + matrixL().template conjugateIf().solveInPlace(dst); + matrixU().template conjugateIf().solveInPlace(dst); } #endif diff --git a/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h b/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h index 5a7056a5..20f28bc1 100644 --- 
a/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h @@ -14,7 +14,7 @@ # define R_MATRIX_CHOLMOD(_NAME_) cholmod_ ## _NAME_ #endif -namespace Eigen { +namespace Eigen { namespace internal { @@ -36,7 +36,7 @@ template<> struct cholmod_configure_matrix > { } }; -// Other scalar types are not yet suppotred by Cholmod +// Other scalar types are not yet supported by Cholmod // template<> struct cholmod_configure_matrix { // template // static void run(CholmodType& mat) { @@ -83,15 +83,15 @@ cholmod_sparse viewAsCholmod(Ref > res.dtype = 0; res.stype = -1; - + if (internal::is_same<_StorageIndex,int>::value) { res.itype = CHOLMOD_INT; } - else if (internal::is_same<_StorageIndex,long>::value) - { - res.itype = CHOLMOD_LONG; - } + // else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value) + // { + // res.itype = CHOLMOD_LONG; + // } else { eigen_assert(false && "Index type not supported yet"); @@ -99,9 +99,9 @@ cholmod_sparse viewAsCholmod(Ref > // setup res.xtype internal::cholmod_configure_matrix<_Scalar>::run(res); - + res.stype = 0; - + return res; } @@ -125,9 +125,12 @@ template cholmod_sparse viewAsCholmod(const SparseSelfAdjointView, UpLo>& mat) { cholmod_sparse res = viewAsCholmod(Ref >(mat.matrix().const_cast_derived())); - + if(UpLo==Upper) res.stype = 1; if(UpLo==Lower) res.stype = -1; + // swap stype for rowmajor matrices (only works for real matrices) + EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + if(_Options & RowMajorBit) res.stype *=-1; return res; } @@ -163,6 +166,42 @@ MappedSparseMatrix viewAsEigen(cholmod_sparse& cm) static_cast(cm.p), static_cast(cm.i),static_cast(cm.x) ); } +namespace internal { + +// template specializations for int and long that call the correct cholmod method + +#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \ + template inline ret cm_ ## name (cholmod_common 
&Common) { return R_MATRIX_CHOLMOD(name) (&Common); } + +#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \ + template inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return R_MATRIX_CHOLMOD(name) (&a1, &Common); } + +EIGEN_CHOLMOD_SPECIALIZE0(int, start) +EIGEN_CHOLMOD_SPECIALIZE0(int, finish) + +EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L) +EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense, cholmod_dense*, X) +EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A) + +EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A) + +template inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return R_MATRIX_CHOLMOD(solve) (sys, &L, &B, &Common); } +// template<> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return R_MATRIX_CHOLMOD(l_solve) (sys, &L, &B, &Common); } + +template inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return R_MATRIX_CHOLMOD(spsolve) (sys, &L, &B, &Common); } +// template<> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return R_MATRIX_CHOLMOD(l_spsolve) (sys, &L, &B, &Common); } + +template +inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return R_MATRIX_CHOLMOD(factorize_p) (A, beta, fset, fsize, L, &Common); } +// template<> +// inline int cm_factorize_p (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return R_MATRIX_CHOLMOD(l_factorize_p) (A, beta, fset, fsize, L, &Common); } + +#undef EIGEN_CHOLMOD_SPECIALIZE0 +#undef EIGEN_CHOLMOD_SPECIALIZE1 + +} // namespace internal + + enum CholmodMode { CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt }; @@ -199,7 +238,7 @@ class 
CholmodBase : public SparseSolverBase { EIGEN_STATIC_ASSERT((internal::is_same::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY); m_shiftOffset[0] = m_shiftOffset[1] = 0.0; - R_MATRIX_CHOLMOD(start)(&m_cholmod); + internal::cm_start(m_cholmod); } explicit CholmodBase(const MatrixType& matrix) @@ -207,23 +246,23 @@ class CholmodBase : public SparseSolverBase { EIGEN_STATIC_ASSERT((internal::is_same::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY); m_shiftOffset[0] = m_shiftOffset[1] = 0.0; - R_MATRIX_CHOLMOD(start)(&m_cholmod); + internal::cm_start(m_cholmod); compute(matrix); } ~CholmodBase() { if(m_cholmodFactor) - R_MATRIX_CHOLMOD(free_factor)(&m_cholmodFactor, &m_cholmod); - R_MATRIX_CHOLMOD(finish)(&m_cholmod); + internal::cm_free_factor(m_cholmodFactor, m_cholmod); + internal::cm_finish(m_cholmod); } - + inline StorageIndex cols() const { return internal::convert_index(m_cholmodFactor->n); } inline StorageIndex rows() const { return internal::convert_index(m_cholmodFactor->n); } - + /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -239,29 +278,29 @@ class CholmodBase : public SparseSolverBase factorize(matrix); return derived(); } - + /** Performs a symbolic decomposition on the sparsity pattern of \a matrix. * * This function is particularly useful when solving for several problems having the same structure. 
- * + * * \sa factorize() */ void analyzePattern(const MatrixType& matrix) { if(m_cholmodFactor) { - R_MATRIX_CHOLMOD(free_factor)(&m_cholmodFactor, &m_cholmod); + internal::cm_free_factor(m_cholmodFactor, m_cholmod); m_cholmodFactor = 0; } cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView()); - m_cholmodFactor = R_MATRIX_CHOLMOD(analyze)(&A, &m_cholmod); - + m_cholmodFactor = internal::cm_analyze(A, m_cholmod); + this->m_isInitialized = true; this->m_info = Success; m_analysisIsOk = true; m_factorizationIsOk = false; } - + /** Performs a numeric decomposition of \a matrix * * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed. @@ -272,17 +311,17 @@ class CholmodBase : public SparseSolverBase { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView()); - R_MATRIX_CHOLMOD(factorize_p)(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod); + internal::cm_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod); // If the factorization failed, minor is the column at which it did. On success minor == n. this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue); m_factorizationIsOk = true; } - + /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations. * See the Cholmod user guide for details. */ cholmod_common& cholmod() { return m_cholmod; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal */ template @@ -292,22 +331,23 @@ class CholmodBase : public SparseSolverBase const Index size = m_cholmodFactor->n; EIGEN_UNUSED_VARIABLE(size); eigen_assert(size==b.rows()); - - // Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref. + + // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref. 
Ref > b_ref(b.derived()); cholmod_dense b_cd = viewAsCholmod(b_ref); - cholmod_dense* x_cd = R_MATRIX_CHOLMOD(solve)(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod); + cholmod_dense* x_cd = internal::cm_solve(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod); if(!x_cd) { this->m_info = NumericalIssue; return; } // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) + // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve dest = Matrix::Map(reinterpret_cast(x_cd->x),b.rows(),b.cols()); - R_MATRIX_CHOLMOD(free_dense)(&x_cd, &m_cholmod); + internal::cm_free_dense(x_cd, m_cholmod); } - + /** \internal */ template void _solve_impl(const SparseMatrixBase &b, SparseMatrixBase &dest) const @@ -320,19 +360,20 @@ class CholmodBase : public SparseSolverBase // note: cs stands for Cholmod Sparse Ref > b_ref(b.const_cast_derived()); cholmod_sparse b_cs = viewAsCholmod(b_ref); - cholmod_sparse* x_cs = R_MATRIX_CHOLMOD(spsolve)(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod); + cholmod_sparse* x_cs = internal::cm_spsolve(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod); if(!x_cs) { this->m_info = NumericalIssue; return; } // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) + // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's sparse solver) dest.derived() = viewAsEigen(*x_cs); - R_MATRIX_CHOLMOD(free_sparse)(&x_cs, &m_cholmod); + internal::cm_free_sparse(x_cs, m_cholmod); } #endif // EIGEN_PARSED_BY_DOXYGEN - - + + /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization. 
* * During the numerical factorization, an offset term is added to the diagonal coefficients:\n @@ -347,7 +388,7 @@ class CholmodBase : public SparseSolverBase m_shiftOffset[0] = double(offset); return derived(); } - + /** \returns the determinant of the underlying matrix from the current factorization */ Scalar determinant() const { @@ -402,7 +443,7 @@ class CholmodBase : public SparseSolverBase template void dumpMemory(Stream& /*s*/) {} - + protected: mutable cholmod_common m_cholmod; cholmod_factor* m_cholmodFactor; @@ -439,11 +480,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl { typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSimplicialLLT() : Base() { init(); } CholmodSimplicialLLT(const MatrixType& matrix) : Base() @@ -490,11 +531,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp { typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSimplicialLDLT() : Base() { init(); } CholmodSimplicialLDLT(const MatrixType& matrix) : Base() @@ -539,11 +580,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper { typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSupernodalLLT() : Base() { init(); } CholmodSupernodalLLT(const MatrixType& matrix) : Base() @@ -590,11 +631,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom { typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodDecomposition() : Base() { init(); } CholmodDecomposition(const MatrixType& matrix) : Base() @@ -604,7 +645,7 @@ class CholmodDecomposition : public 
CholmodBase<_MatrixType, _UpLo, CholmodDecom } ~CholmodDecomposition() {} - + void setMode(CholmodMode mode) { switch(mode) diff --git a/inst/include/Eigen/src/Core/ArithmeticSequence.h b/inst/include/Eigen/src/Core/ArithmeticSequence.h new file mode 100644 index 00000000..b6200fac --- /dev/null +++ b/inst/include/Eigen/src/Core/ArithmeticSequence.h @@ -0,0 +1,413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARITHMETIC_SEQUENCE_H +#define EIGEN_ARITHMETIC_SEQUENCE_H + +namespace Eigen { + +namespace internal { + +#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) +template struct aseq_negate {}; + +template<> struct aseq_negate { + typedef Index type; +}; + +template struct aseq_negate > { + typedef FixedInt<-N> type; +}; + +// Compilation error in the following case: +template<> struct aseq_negate > {}; + +template::value, + bool SizeIsSymbolic =symbolic::is_symbolic::value> +struct aseq_reverse_first_type { + typedef Index type; +}; + +template +struct aseq_reverse_first_type { + typedef symbolic::AddExpr > >, + symbolic::ValueExpr > + > type; +}; + +template +struct aseq_reverse_first_type_aux { + typedef Index type; +}; + +template +struct aseq_reverse_first_type_aux::type> { + typedef FixedInt<(SizeType::value-1)*IncrType::value> type; +}; + +template +struct aseq_reverse_first_type { + typedef typename aseq_reverse_first_type_aux::type Aux; + typedef symbolic::AddExpr > type; +}; + +template +struct aseq_reverse_first_type { + typedef symbolic::AddExpr > >, + symbolic::ValueExpr >, + symbolic::ValueExpr<> > type; +}; +#endif + +// Helper to cleanup the type of the increment: +template struct cleanup_seq_incr { + typedef typename 
cleanup_index_type::type type; +}; + +} + +//-------------------------------------------------------------------------------- +// seq(first,last,incr) and seqN(first,size,incr) +//-------------------------------------------------------------------------------- + +template > +class ArithmeticSequence; + +template +ArithmeticSequence::type, + typename internal::cleanup_index_type::type, + typename internal::cleanup_seq_incr::type > +seqN(FirstType first, SizeType size, IncrType incr); + +/** \class ArithmeticSequence + * \ingroup Core_Module + * + * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by + * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride) + * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i. + * + * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments + * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the + * only way it is used. + * + * \tparam FirstType type of the first element, usually an Index, + * but internally it can be a symbolic expression + * \tparam SizeType type representing the size of the sequence, usually an Index + * or a compile time integral constant. 
Internally, it can also be a symbolic expression + * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1) + * + * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView + */ +template +class ArithmeticSequence +{ +public: + ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {} + ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {} + + enum { + SizeAtCompileTime = internal::get_fixed_value::value, + IncrAtCompileTime = internal::get_fixed_value::value + }; + + /** \returns the size, i.e., number of elements, of the sequence */ + Index size() const { return m_size; } + + /** \returns the first element \f$ a_0 \f$ in the sequence */ + Index first() const { return m_first; } + + /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */ + Index operator[](Index i) const { return m_first + i * m_incr; } + + const FirstType& firstObject() const { return m_first; } + const SizeType& sizeObject() const { return m_size; } + const IncrType& incrObject() const { return m_incr; } + +protected: + FirstType m_first; + SizeType m_size; + IncrType m_incr; + +public: + +#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) + auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) { + return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); + } +#else +protected: + typedef typename internal::aseq_negate::type ReverseIncrType; + typedef typename internal::aseq_reverse_first_type::type ReverseFirstType; +public: + ArithmeticSequence + reverse() const { + return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); + } +#endif +}; + +/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr + * + * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) 
*/ +template +ArithmeticSequence::type,typename internal::cleanup_index_type::type,typename internal::cleanup_seq_incr::type > +seqN(FirstType first, SizeType size, IncrType incr) { + return ArithmeticSequence::type,typename internal::cleanup_index_type::type,typename internal::cleanup_seq_incr::type>(first,size,incr); +} + +/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment + * + * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */ +template +ArithmeticSequence::type,typename internal::cleanup_index_type::type > +seqN(FirstType first, SizeType size) { + return ArithmeticSequence::type,typename internal::cleanup_index_type::type>(first,size); +} + +#ifdef EIGEN_PARSED_BY_DOXYGEN + +/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr + * + * It is essentially an alias to: + * \code + * seqN(f, (l-f+incr)/incr, incr); + * \endcode + * + * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) + */ +template +auto seq(FirstType f, LastType l, IncrType incr); + +/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment + * + * It is essentially an alias to: + * \code + * seqN(f,l-f+1); + * \endcode + * + * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) + */ +template +auto seq(FirstType f, LastType l); + +#else // EIGEN_PARSED_BY_DOXYGEN + +#if EIGEN_HAS_CXX11 +template +auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + - typename internal::cleanup_index_type::type(f)+fix<1>()))) +{ + return seqN(typename internal::cleanup_index_type::type(f), + (typename internal::cleanup_index_type::type(l) + -typename internal::cleanup_index_type::type(f)+fix<1>())); +} + +template +auto seq(FirstType f, LastType l, IncrType incr) + -> decltype(seqN(typename 
internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + - typename internal::cleanup_index_type::type(f)+typename internal::cleanup_seq_incr::type(incr) + ) / typename internal::cleanup_seq_incr::type(incr), + typename internal::cleanup_seq_incr::type(incr))) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + -typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr), + CleanedIncrType(incr)); +} + +#else // EIGEN_HAS_CXX11 + +template +typename internal::enable_if::value || symbolic::is_symbolic::value), + ArithmeticSequence::type,Index> >::type +seq(FirstType f, LastType l) +{ + return seqN(typename internal::cleanup_index_type::type(f), + Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+fix<1>()))); +} + +template +typename internal::enable_if::value, + ArithmeticSequence,symbolic::ValueExpr<> >, + symbolic::ValueExpr > > > >::type +seq(const symbolic::BaseExpr &f, LastType l) +{ + return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+fix<1>())); +} + +template +typename internal::enable_if::value, + ArithmeticSequence::type, + symbolic::AddExpr >, + symbolic::ValueExpr > > > >::type +seq(FirstType f, const symbolic::BaseExpr &l) +{ + return seqN(typename internal::cleanup_index_type::type(f),(l.derived()-typename internal::cleanup_index_type::type(f)+fix<1>())); +} + +template +ArithmeticSequence >,symbolic::ValueExpr > > > +seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l) +{ + return seqN(f.derived(),(l.derived()-f.derived()+fix<1>())); +} + + +template +typename internal::enable_if::value || symbolic::is_symbolic::value), + ArithmeticSequence::type,Index,typename internal::cleanup_seq_incr::type> >::type +seq(FirstType f, LastType l, IncrType incr) +{ + typedef 
typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr); +} + +template +typename internal::enable_if::value, + ArithmeticSequence, + symbolic::ValueExpr<> >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> >::type +seq(const symbolic::BaseExpr &f, LastType l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} + +template +typename internal::enable_if::value, + ArithmeticSequence::type, + symbolic::QuotientExpr >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> >::type +seq(FirstType f, const symbolic::BaseExpr &l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + (l.derived()-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} + +template +ArithmeticSequence >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> +seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} +#endif // EIGEN_HAS_CXX11 + +#endif // EIGEN_PARSED_BY_DOXYGEN + + +#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN) +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr. 
+ * + * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode + * + * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ +template +auto lastN(SizeType size, IncrType incr) +-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr)) +{ + return seqN(Eigen::last-(size-fix<1>())*incr, size, incr); +} + +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment. + * + * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode + * + * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */ +template +auto lastN(SizeType size) +-> decltype(seqN(Eigen::last+fix<1>()-size, size)) +{ + return seqN(Eigen::last+fix<1>()-size, size); +} +#endif + +namespace internal { + +// Convert a symbolic span into a usable one (i.e., remove last/end "keywords") +template +struct make_size_type { + typedef typename internal::conditional::value, Index, T>::type type; +}; + +template +struct IndexedViewCompatibleType, XprSize> { + typedef ArithmeticSequence::type,IncrType> type; +}; + +template +ArithmeticSequence::type,IncrType> +makeIndexedViewCompatible(const ArithmeticSequence& ids, Index size,SpecializedType) { + return ArithmeticSequence::type,IncrType>( + eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject()); +} + +template +struct get_compile_time_incr > { + enum { value = get_fixed_value::value }; +}; + +} // end namespace internal + +/** \namespace Eigen::indexing + * \ingroup Core_Module + * + * The sole purpose of this namespace is to be able to import all functions + * and symbols that are expected to be used within operator() for indexing + * and slicing. If you already imported the whole Eigen namespace: + * \code using namespace Eigen; \endcode + * then you are already all set. 
Otherwise, if you don't want/cannot import + * the whole Eigen namespace, the following line: + * \code using namespace Eigen::indexing; \endcode + * is equivalent to: + * \code + using Eigen::all; + using Eigen::seq; + using Eigen::seqN; + using Eigen::lastN; // c++11 only + using Eigen::last; + using Eigen::lastp1; + using Eigen::fix; + \endcode + */ +namespace indexing { + using Eigen::all; + using Eigen::seq; + using Eigen::seqN; + #if EIGEN_HAS_CXX11 + using Eigen::lastN; + #endif + using Eigen::last; + using Eigen::lastp1; + using Eigen::fix; +} + +} // end namespace Eigen + +#endif // EIGEN_ARITHMETIC_SEQUENCE_H diff --git a/inst/include/Eigen/src/Core/Array.h b/inst/include/Eigen/src/Core/Array.h index 16770fc7..20c789b1 100644 --- a/inst/include/Eigen/src/Core/Array.h +++ b/inst/include/Eigen/src/Core/Array.h @@ -117,7 +117,7 @@ class Array { return Base::_set(other); } - + /** Default constructor. * * For fixed-size matrices, does nothing. @@ -157,11 +157,50 @@ class Array EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { - other.swap(*this); + Base::operator=(std::move(other)); return *this; } #endif + #if EIGEN_HAS_CXX11 + /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + * + * Example: \include Array_variadic_ctor_cxx11.cpp + * Output: \verbinclude Array_variadic_ctor_cxx11.out + * + * \sa Array(const std::initializer_list>&) + * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) {} + + /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. 
\cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Array_initializer_list_23_cxx11.cpp + * Output: \verbinclude Array_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed. + * Therefore Array{{1,2,3,4,5}} is legal and the more verbose syntax + * Array{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Array_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Array_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes, + * and implicit transposition is allowed for compile-time 1D arrays only. + * + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const std::initializer_list>& list) : Base(list) {} + #endif // end EIGEN_HAS_CXX11 + #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -178,6 +217,7 @@ class Array Base::_check_template_params(); this->template _init2(val0, val1); } + #else /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); @@ -189,7 +229,8 @@ class Array */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Array(Index dim); - /** constructs an initialized 1x1 Array with the given coefficient */ + /** constructs an initialized 1x1 Array with the given coefficient + * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */ Array(const Scalar& value); /** constructs an uninitialized array with \a rows rows and \a cols columns. 
* @@ -197,11 +238,14 @@ class Array * it is redundant to pass these parameters, so one should use the default constructor * Array() instead. */ Array(Index rows, Index cols); - /** constructs an initialized 2D vector with given coefficients */ + /** constructs an initialized 2D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ Array(const Scalar& val0, const Scalar& val1); - #endif + #endif // end EIGEN_PARSED_BY_DOXYGEN - /** constructs an initialized 3D vector with given coefficients */ + /** constructs an initialized 3D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) { @@ -211,7 +255,9 @@ class Array m_storage.data()[1] = val1; m_storage.data()[2] = val2; } - /** constructs an initialized 4D vector with given coefficients */ + /** constructs an initialized 4D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) { @@ -242,8 +288,10 @@ class Array : Base(other.derived()) { } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT{ return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); } #ifdef EIGEN_ARRAY_PLUGIN #include EIGEN_ARRAY_PLUGIN @@ -258,7 +306,7 @@ class Array /** \defgroup arraytypedefs Global array typedefs * \ingroup Core_Module * - * Eigen defines several typedef shortcuts for most common 1D and 2D array types. + * %Eigen defines several typedef shortcuts for most common 1D and 2D array types. * * The general patterns are the following: * @@ -271,6 +319,12 @@ class Array * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is * a fixed-size 1D array of 4 complex floats. * + * With \cpp11, template alias are also defined for common sizes. + * They follow the same pattern as above except that the scalar type suffix is replaced by a + * template parameter, i.e.: + * - `ArrayRowsCols` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size. + * - `ArraySize` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays. 
+ * * \sa class Array */ @@ -303,8 +357,42 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES #undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS + +#if EIGEN_HAS_CXX11 + +#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##SizeSuffix##SizeSuffix = Array; \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##SizeSuffix = Array; + +#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size) \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##Size##X = Array; \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##X##Size = Array; + +EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2) +EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3) +EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4) +EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4) + +#undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS -#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE +#endif // EIGEN_HAS_CXX11 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \ using Eigen::Matrix##SizeSuffix##TypeSuffix; \ diff --git a/inst/include/Eigen/src/Core/ArrayBase.h b/inst/include/Eigen/src/Core/ArrayBase.h index 33f644e2..ea3dd1c3 100644 --- a/inst/include/Eigen/src/Core/ArrayBase.h +++ b/inst/include/Eigen/src/Core/ArrayBase.h @@ -69,6 +69,7 @@ template class ArrayBase using Base::coeff; using Base::coeffRef; using Base::lazyAssign; + using Base::operator-; using Base::operator=; using Base::operator+=; using Base::operator-=; @@ -88,7 +89,6 @@ template class ArrayBase #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase #define EIGEN_DOC_UNARY_ADDONS(X,Y) -# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/MatrixCwiseUnaryOps.h" # include "../plugins/ArrayCwiseUnaryOps.h" # include 
"../plugins/CommonCwiseBinaryOps.h" diff --git a/inst/include/Eigen/src/Core/ArrayWrapper.h b/inst/include/Eigen/src/Core/ArrayWrapper.h index 688aadd6..2e9555b5 100644 --- a/inst/include/Eigen/src/Core/ArrayWrapper.h +++ b/inst/include/Eigen/src/Core/ArrayWrapper.h @@ -10,7 +10,7 @@ #ifndef EIGEN_ARRAYWRAPPER_H #define EIGEN_ARRAYWRAPPER_H -namespace Eigen { +namespace Eigen { /** \class ArrayWrapper * \ingroup Core_Module @@ -60,14 +60,14 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } @@ -90,9 +90,9 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { dst = m_expression; } - const typename internal::remove_all::type& EIGEN_DEVICE_FUNC - nestedExpression() const + const typename internal::remove_all::type& + nestedExpression() const { return m_expression; } @@ -158,14 +158,14 @@ class MatrixWrapper : public MatrixBase > EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {} - 
EIGEN_DEVICE_FUNC - inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } @@ -185,8 +185,8 @@ class MatrixWrapper : public MatrixBase > } EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - nestedExpression() const + const typename internal::remove_all::type& + nestedExpression() const { return m_expression; } diff --git a/inst/include/Eigen/src/Core/Assign.h b/inst/include/Eigen/src/Core/Assign.h index 53806ba3..655412ef 100644 --- a/inst/include/Eigen/src/Core/Assign.h +++ b/inst/include/Eigen/src/Core/Assign.h @@ -16,7 +16,7 @@ namespace Eigen { template template -EIGEN_STRONG_INLINE Derived& DenseBase +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase ::lazyAssign(const DenseBase& other) { enum{ diff --git a/inst/include/Eigen/src/Core/AssignEvaluator.h b/inst/include/Eigen/src/Core/AssignEvaluator.h index dbe435d8..7d76f0c2 100644 --- a/inst/include/Eigen/src/Core/AssignEvaluator.h +++ b/inst/include/Eigen/src/Core/AssignEvaluator.h @@ -17,24 +17,24 @@ namespace Eigen { // This implementation is based on Assign.h namespace internal { - + /*************************************************************************** * Part 
1 : the logic deciding a strategy for traversal and unrolling * ***************************************************************************/ // copy_using_evaluator_traits is based on assign_traits -template +template struct copy_using_evaluator_traits { typedef typename DstEvaluator::XprType Dst; typedef typename Dst::Scalar DstScalar; - + enum { DstFlags = DstEvaluator::Flags, SrcFlags = SrcEvaluator::Flags }; - + public: enum { DstAlignment = DstEvaluator::Alignment, @@ -51,13 +51,15 @@ struct copy_using_evaluator_traits InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime) : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime) : int(Dst::MaxRowsAtCompileTime), + RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize), + RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize), OuterStride = int(outer_stride_at_compile_time::ret), MaxSizeAtCompileTime = Dst::SizeAtCompileTime }; // TODO distinguish between linear traversal and inner-traversals - typedef typename find_best_packet::type LinearPacketType; - typedef typename find_best_packet::type InnerPacketType; + typedef typename find_best_packet::type LinearPacketType; + typedef typename find_best_packet::type InnerPacketType; enum { LinearPacketSize = unpacket_traits::size, @@ -97,7 +99,8 @@ struct copy_using_evaluator_traits public: enum { - Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + Traversal = int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time. + : (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal) : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) @@ -135,7 +138,7 @@ struct copy_using_evaluator_traits ? 
int(CompleteUnrolling) : int(NoUnrolling) ) : int(Traversal) == int(LinearTraversal) - ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) + ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) ) #if EIGEN_UNALIGNED_VECTORIZE : int(Traversal) == int(SliceVectorizedTraversal) @@ -172,6 +175,8 @@ struct copy_using_evaluator_traits EIGEN_DEBUG_VAR(MaySliceVectorize) std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime) EIGEN_DEBUG_VAR(UnrollingLimit) EIGEN_DEBUG_VAR(MayUnrollCompletely) EIGEN_DEBUG_VAR(MayUnrollInner) @@ -195,7 +200,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling // FIXME: this is not very clean, perhaps this information should be provided by the kernel? typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; - + enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime @@ -261,7 +266,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; - + enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime, @@ -312,6 +317,22 @@ template struct dense_assignment_loop; +/************************ +***** Special Cases ***** +************************/ + +// Zero-sized assignment is a no-op. 
+template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT) + } +}; + /************************ *** Default traversal *** ************************/ @@ -426,10 +447,10 @@ struct dense_assignment_loop::size, - alignedSize = (size/packetSize)*packetSize }; + alignedSize = (int(size)/packetSize)*packetSize }; copy_using_evaluator_innervec_CompleteUnrolling::run(kernel); copy_using_evaluator_DefaultTraversal_CompleteUnrolling::run(kernel); @@ -530,7 +551,7 @@ struct dense_assignment_loop const Scalar *dst_ptr = kernel.dstDataPtr(); if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0) { - // the pointer is not aligend-on scalar, so alignment is not possible + // the pointer is not aligned-on scalar, so alignment is not possible return dense_assignment_loop::run(kernel); } const Index packetAlignedMask = packetSize - 1; @@ -568,14 +589,15 @@ struct dense_assignment_loop typedef typename Kernel::DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; - enum { size = DstXprType::InnerSizeAtCompileTime, + enum { innerSize = DstXprType::InnerSizeAtCompileTime, packetSize =unpacket_traits::size, - vectorizableSize = (size/packetSize)*packetSize }; + vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize), + size = DstXprType::SizeAtCompileTime }; for(Index outer = 0; outer < kernel.outerSize(); ++outer) { copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); - copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); + copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); } } }; @@ -599,73 +621,74 @@ class generic_dense_assignment_kernel typedef typename DstEvaluatorTypeT::XprType DstXprType; typedef typename SrcEvaluatorTypeT::XprType 
SrcXprType; public: - + typedef DstEvaluatorTypeT DstEvaluatorType; typedef SrcEvaluatorTypeT SrcEvaluatorType; typedef typename DstEvaluatorType::Scalar Scalar; typedef copy_using_evaluator_traits AssignmentTraits; typedef typename AssignmentTraits::PacketType PacketType; - - - EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) { #ifdef EIGEN_DEBUG_ASSIGN AssignmentTraits::debug(); #endif } - - EIGEN_DEVICE_FUNC Index size() const { return m_dstExpr.size(); } - EIGEN_DEVICE_FUNC Index innerSize() const { return m_dstExpr.innerSize(); } - EIGEN_DEVICE_FUNC Index outerSize() const { return m_dstExpr.outerSize(); } - EIGEN_DEVICE_FUNC Index rows() const { return m_dstExpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_dstExpr.cols(); } - EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); } - - EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; } - EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; } - + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); } + + EIGEN_DEVICE_FUNC DstEvaluatorType& 
dstEvaluator() EIGEN_NOEXCEPT { return m_dst; } + EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; } + /// Assign src(row,col) to dst(row,col) through the assignment functor. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col) { m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col)); } - + /// \sa assignCoeff(Index,Index) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index) { m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index)); } - + /// \sa assignCoeff(Index,Index) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner) { - Index row = rowIndexByOuterInner(outer, inner); - Index col = colIndexByOuterInner(outer, inner); + Index row = rowIndexByOuterInner(outer, inner); + Index col = colIndexByOuterInner(outer, inner); assignCoeff(row, col); } - - + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) { m_functor.template assignPacket(&m_dst.coeffRef(row,col), m_src.template packet(row,col)); } - + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index) { m_functor.template assignPacket(&m_dst.coeffRef(index), m_src.template packet(index)); } - + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { - Index row = rowIndexByOuterInner(outer, inner); + Index row = rowIndexByOuterInner(outer, inner); Index col = colIndexByOuterInner(outer, inner); assignPacket(row, col); } - + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; @@ -688,7 +711,7 @@ class generic_dense_assignment_kernel { return m_dstExpr.data(); } - + protected: DstEvaluatorType& m_dst; const SrcEvaluatorType& m_src; @@ -697,6 +720,27 @@ class generic_dense_assignment_kernel DstXprType& m_dstExpr; }; +// Special kernel used when computing 
small products whose operands have dynamic dimensions. It ensures that the +// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used +// when computing the product. + +template +class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel +{ +protected: + typedef generic_dense_assignment_kernel Base; + public: + typedef typename Base::Scalar Scalar; + typedef typename Base::DstXprType DstXprType; + typedef copy_using_evaluator_traits AssignmentTraits; + typedef typename AssignmentTraits::PacketType PacketType; + + EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + : Base(dst, src, func, dstExpr) + { + } + }; + /*************************************************************************** * Part 5 : Entry point for dense rectangular assignment ***************************************************************************/ @@ -734,13 +778,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType resize_if_allowed(dst, src, func); DstEvaluatorType dstEvaluator(dst); - + typedef generic_dense_assignment_kernel Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); dense_assignment_loop::run(kernel); } +// Specialization for filling the destination with a constant value. 
+#ifndef EIGEN_GPU_COMPILE_PHASE +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp, DstXprType>& src, const internal::assign_op& func) +{ + resize_if_allowed(dst, src, func); + std::fill_n(dst.data(), dst.size(), src.functor()()); +} +#endif + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) { @@ -756,13 +810,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType // AssignmentKind must define a Kind typedef. template struct AssignmentKind; -// Assignement kind defined in this file: +// Assignment kind defined in this file: struct Dense2Dense {}; struct EigenBase2EigenBase {}; template struct AssignmentKind { typedef EigenBase2EigenBase Kind; }; template<> struct AssignmentKind { typedef Dense2Dense Kind; }; - + // This is the main assignment class template< typename DstXprType, typename SrcXprType, typename Functor, typename Kind = typename AssignmentKind< typename evaluator_traits::Shape , typename evaluator_traits::Shape >::Kind, @@ -787,7 +841,7 @@ void call_assignment(const Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op()); } - + // Deal with "assume-aliasing" template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -827,14 +881,35 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) typedef typename internal::conditional, Dst>::type ActualDstTypeCleaned; typedef typename internal::conditional, Dst&>::type ActualDstType; ActualDstType actualDst(dst); - + // TODO check whether this is the right place to perform these checks: EIGEN_STATIC_ASSERT_LVALUE(Dst) EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src) EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar); - + Assignment::run(actualDst, src, func); } + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void 
call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func) +{ + typedef evaluator DstEvaluatorType; + typedef evaluator SrcEvaluatorType; + typedef restricted_packet_dense_assignment_kernel Kernel; + + EIGEN_STATIC_ASSERT_LVALUE(Dst) + EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar); + + SrcEvaluatorType srcEvaluator(src); + resize_if_allowed(dst, src, func); + + DstEvaluatorType dstEvaluator(dst); + Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); + + dense_assignment_loop::run(kernel); +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment_no_alias(Dst& dst, const Src& src) @@ -875,7 +950,7 @@ struct Assignment #ifndef EIGEN_NO_DEBUG internal::check_for_aliasing(dst, src); #endif - + call_dense_assignment_loop(dst, src, func); } }; @@ -899,7 +974,7 @@ struct Assignment src.evalTo(dst); } - // NOTE The following two functions are templated to avoid their instanciation if not needed + // NOTE The following two functions are templated to avoid their instantiation if not needed // This is needed because some expressions supports evalTo only and/or have 'void' as scalar type. 
template EIGEN_DEVICE_FUNC diff --git a/inst/include/Eigen/src/Core/Assign_MKL.h b/inst/include/Eigen/src/Core/Assign_MKL.h index 6866095b..c6140d18 100755 --- a/inst/include/Eigen/src/Core/Assign_MKL.h +++ b/inst/include/Eigen/src/Core/Assign_MKL.h @@ -68,16 +68,16 @@ class vml_assign_traits #define EIGEN_PP_EXPAND(ARG) ARG #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1) -#define EIGEN_VMLMODE_EXPAND_LA , VML_HA +#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA #else -#define EIGEN_VMLMODE_EXPAND_LA , VML_LA +#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA #endif -#define EIGEN_VMLMODE_EXPAND__ +#define EIGEN_VMLMODE_EXPAND_x_ -#define EIGEN_VMLMODE_PREFIX_LA vm -#define EIGEN_VMLMODE_PREFIX__ v -#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE) +#define EIGEN_VMLMODE_PREFIX_xLA vm +#define EIGEN_VMLMODE_PREFIX_x_ v +#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE) #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \ template< typename DstXprType, typename SrcXprNested> \ @@ -89,7 +89,7 @@ class vml_assign_traits eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ if(vml_assign_traits::Traversal==LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ - (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \ } else { \ const Index outerSize = dst.outerSize(); \ for(Index outer = 0; outer < outerSize; ++outer) { \ @@ -97,7 +97,7 @@ class vml_assign_traits &(src.nestedExpression().coeffRef(0, outer)); \ EIGENTYPE *dst_ptr = dst.IsRowMajor ? 
&(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, \ - (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ } \ } \ } \ @@ -152,7 +152,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) if(vml_assign_traits::Traversal==LinearTraversal) \ { \ VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent, \ - (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \ } else { \ const Index outerSize = dst.outerSize(); \ for(Index outer = 0; outer < outerSize; ++outer) { \ @@ -160,7 +160,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) &(src.lhs().coeffRef(0, outer)); \ EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent, \ - (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ } \ } \ } \ diff --git a/inst/include/Eigen/src/Core/BandMatrix.h b/inst/include/Eigen/src/Core/BandMatrix.h index 4978c914..878c0240 100644 --- a/inst/include/Eigen/src/Core/BandMatrix.h +++ b/inst/include/Eigen/src/Core/BandMatrix.h @@ -10,7 +10,7 @@ #ifndef EIGEN_BANDMATRIX_H #define EIGEN_BANDMATRIX_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -45,7 +45,7 @@ class BandMatrixBase : public EigenBase }; public: - + using Base::derived; using Base::rows; using Base::cols; @@ -55,10 +55,10 @@ class BandMatrixBase : public EigenBase /** \returns the number of sub diagonals */ inline Index subs() const { return derived().subs(); } - + /** \returns an expression of the underlying coefficient matrix */ inline const CoefficientsType& coeffs() const { return derived().coeffs(); } - + /** \returns an expression of the underlying coefficient matrix 
*/ inline CoefficientsType& coeffs() { return derived().coeffs(); } @@ -67,7 +67,7 @@ class BandMatrixBase : public EigenBase * \warning the internal storage must be column major. */ inline Block col(Index i) { - EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); Index start = 0; Index len = coeffs().rows(); if (i<=supers()) @@ -90,7 +90,7 @@ class BandMatrixBase : public EigenBase template struct DiagonalIntReturnType { enum { - ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)), + ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)), Conjugate = ReturnOpposite && NumTraits::IsComplex, ActualIndex = ReturnOpposite ? -Index : Index, DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic) @@ -130,7 +130,7 @@ class BandMatrixBase : public EigenBase eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers())); return Block(coeffs(), supers()-i, std::max(0,i), 1, diagonalLength(i)); } - + template inline void evalTo(Dest& dst) const { dst.resize(rows(),cols()); @@ -192,7 +192,7 @@ struct traits > Options = _Options, DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic }; - typedef Matrix CoefficientsType; + typedef Matrix CoefficientsType; }; template @@ -211,16 +211,16 @@ class BandMatrix : public BandMatrixBase @@ -52,7 +52,7 @@ struct traits > : traits::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit, // FIXME DirectAccessBit should not be handled by expressions - // + // // Alignment is needed by MapBase's assertions // We can sefely set it to false here. 
Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator Alignment = 0 @@ -61,7 +61,7 @@ struct traits > : traits::ret> class BlockImpl_dense; - + } // end namespace internal template class BlockImpl; @@ -109,13 +109,13 @@ template class typedef Impl Base; EIGEN_GENERIC_PUBLIC_INTERFACE(Block) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) - + typedef typename internal::remove_all::type NestedExpression; - + /** Column or Row constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, Index i) : Impl(xpr,i) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index i) : Impl(xpr,i) { eigen_assert( (i>=0) && ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i class /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) { EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) @@ -135,8 +135,8 @@ template class /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) @@ -147,7 +147,7 @@ template class && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols); } }; - + // The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense // that must be specialized for direct and non-direct access... 
template @@ -159,10 +159,10 @@ class BlockImpl public: typedef Impl Base; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) - EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} - EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} EIGEN_DEVICE_FUNC - inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) {} }; @@ -294,25 +294,25 @@ template::type& nestedExpression() const - { - return m_xpr; + { + return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } - - EIGEN_DEVICE_FUNC - StorageIndex startRow() const - { - return m_startRow.value(); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startRow() const EIGEN_NOEXCEPT + { + return m_startRow.value(); } - - EIGEN_DEVICE_FUNC - StorageIndex startCol() const - { - return m_startCol.value(); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startCol() const EIGEN_NOEXCEPT + { + return m_startCol.value(); } protected: @@ -342,9 +342,9 @@ class BlockImpl_dense /** Column or Row constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, Index i) - : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index i) + : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) || ((BlockRows==XprType::RowsAtCompileTime) && 
(BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), BlockRows==1 ? 1 : xpr.rows(), BlockCols==1 ? 1 : xpr.cols()), @@ -357,8 +357,8 @@ class BlockImpl_dense /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { @@ -367,8 +367,8 @@ class BlockImpl_dense /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), @@ -377,18 +377,18 @@ class BlockImpl_dense init(); } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& nestedExpression() const - { - return m_xpr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& nestedExpression() const EIGEN_NOEXCEPT + { + return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } - + /** \sa MapBase::innerStride() */ - EIGEN_DEVICE_FUNC - inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index innerStride() const EIGEN_NOEXCEPT { return internal::traits::HasSameStorageOrderAsXprType ? 
m_xpr.innerStride() @@ -396,23 +396,19 @@ class BlockImpl_dense } /** \sa MapBase::outerStride() */ - EIGEN_DEVICE_FUNC - inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index outerStride() const EIGEN_NOEXCEPT { - return m_outerStride; + return internal::traits::HasSameStorageOrderAsXprType + ? m_xpr.outerStride() + : m_xpr.innerStride(); } - EIGEN_DEVICE_FUNC - StorageIndex startRow() const - { - return m_startRow.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startRow() const EIGEN_NOEXCEPT { return m_startRow.value(); } - EIGEN_DEVICE_FUNC - StorageIndex startCol() const - { - return m_startCol.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startCol() const EIGEN_NOEXCEPT { return m_startCol.value(); } #ifndef __SUNPRO_CC // FIXME sunstudio is not friendly with the above friend... @@ -422,8 +418,8 @@ class BlockImpl_dense #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal used by allowAligned() */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) : Base(data, blockRows, blockCols), m_xpr(xpr) { init(); @@ -431,7 +427,7 @@ class BlockImpl_dense #endif protected: - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void init() { m_outerStride = internal::traits::HasSameStorageOrderAsXprType diff --git a/inst/include/Eigen/src/Core/BooleanRedux.h b/inst/include/Eigen/src/Core/BooleanRedux.h index 8409d874..852de8b9 100644 --- a/inst/include/Eigen/src/Core/BooleanRedux.h +++ b/inst/include/Eigen/src/Core/BooleanRedux.h @@ -14,58 +14,56 @@ namespace Eigen { namespace internal { -template +template struct all_unroller { - typedef typename Derived::ExpressionTraits Traits; enum { - col = (UnrollCount-1) / Traits::RowsAtCompileTime, - row = (UnrollCount-1) % 
Traits::RowsAtCompileTime + col = (UnrollCount-1) / Rows, + row = (UnrollCount-1) % Rows }; - static inline bool run(const Derived &mat) + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(row, col); } }; -template -struct all_unroller +template +struct all_unroller { - static inline bool run(const Derived &/*mat*/) { return true; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { - static inline bool run(const Derived &) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { - typedef typename Derived::ExpressionTraits Traits; enum { - col = (UnrollCount-1) / Traits::RowsAtCompileTime, - row = (UnrollCount-1) % Traits::RowsAtCompileTime + col = (UnrollCount-1) / Rows, + row = (UnrollCount-1) % Rows }; - static inline bool run(const Derived &mat) + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(row, col); } }; -template -struct any_unroller +template +struct any_unroller { - static inline bool run(const Derived & /*mat*/) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { - static inline bool run(const Derived &) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; } // end namespace internal @@ -78,16 +76,16 @@ struct any_unroller * \sa any(), Cwise::operator<() */ template -inline bool DenseBase::all() const +EIGEN_DEVICE_FUNC inline bool DenseBase::all() const { typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * 
(Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::run(evaluator); + return internal::all_unroller::RowsAtCompileTime>::run(evaluator); else { for(Index j = 0; j < cols(); ++j) @@ -102,16 +100,16 @@ inline bool DenseBase::all() const * \sa all() */ template -inline bool DenseBase::any() const +EIGEN_DEVICE_FUNC inline bool DenseBase::any() const { typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::run(evaluator); + return internal::any_unroller::RowsAtCompileTime>::run(evaluator); else { for(Index j = 0; j < cols(); ++j) @@ -126,7 +124,7 @@ inline bool DenseBase::any() const * \sa all(), any() */ template -inline Eigen::Index DenseBase::count() const +EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const { return derived().template cast().template cast().sum(); } diff --git a/inst/include/Eigen/src/Core/CommaInitializer.h b/inst/include/Eigen/src/Core/CommaInitializer.h index d218e981..c0e29c75 100644 --- a/inst/include/Eigen/src/Core/CommaInitializer.h +++ b/inst/include/Eigen/src/Core/CommaInitializer.h @@ -33,6 +33,8 @@ struct CommaInitializer inline CommaInitializer(XprType& xpr, const Scalar& s) : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1) { + eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0 + && "Cannot comma-initialize a 0x0 matrix (operator<<)"); m_xpr.coeffRef(0,0) = s; } @@ -41,6 +43,8 @@ struct CommaInitializer inline CommaInitializer(XprType& xpr, const DenseBase& other) : m_xpr(xpr), m_row(0), m_col(other.cols()), 
m_currentBlockRows(other.rows()) { + eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols() + && "Cannot comma-initialize a 0x0 matrix (operator<<)"); m_xpr.block(0, 0, other.rows(), other.cols()) = other; } @@ -103,7 +107,7 @@ struct CommaInitializer EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception) #endif { - finished(); + finished(); } /** \returns the built matrix once all its coefficients have been set. @@ -141,7 +145,7 @@ struct CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer */ template -inline CommaInitializer DenseBase::operator<< (const Scalar& s) +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<< (const Scalar& s) { return CommaInitializer(*static_cast(this), s); } @@ -149,7 +153,7 @@ inline CommaInitializer DenseBase::operator<< (const Scalar& s /** \sa operator<<(const Scalar&) */ template template -inline CommaInitializer +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<<(const DenseBase& other) { return CommaInitializer(*static_cast(this), other); diff --git a/inst/include/Eigen/src/Core/CoreEvaluators.h b/inst/include/Eigen/src/Core/CoreEvaluators.h index 910889ef..0ff8c8de 100644 --- a/inst/include/Eigen/src/Core/CoreEvaluators.h +++ b/inst/include/Eigen/src/Core/CoreEvaluators.h @@ -14,7 +14,7 @@ #define EIGEN_COREEVALUATORS_H namespace Eigen { - + namespace internal { // This class returns the evaluator kind from the expression storage kind. 
@@ -63,8 +63,8 @@ template< typename T, template< typename T, typename Kind = typename evaluator_traits::Kind, typename Scalar = typename T::Scalar> struct unary_evaluator; - -// evaluator_traits contains traits for evaluator + +// evaluator_traits contains traits for evaluator template struct evaluator_traits_base @@ -90,7 +90,8 @@ template struct evaluator : public unary_evaluator { typedef unary_evaluator Base; - EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const T& xpr) : Base(xpr) {} }; @@ -99,21 +100,29 @@ template struct evaluator : evaluator { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : evaluator(xpr) {} }; // ---------- base class for all evaluators ---------- template -struct evaluator_base : public noncopyable +struct evaluator_base { // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices. typedef traits ExpressionTraits; - + enum { Alignment = 0 }; + // noncopyable: + // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization) + // and make complex evaluator much larger than then should do. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {} +private: + EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&); + EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&); }; // -------------------- Matrix and Array -------------------- @@ -123,6 +132,33 @@ struct evaluator_base : public noncopyable // Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense, // so no need for more sophisticated dispatching. +// this helper permits to completely eliminate m_outerStride if it is known at compiletime. 
+template class plainobjectbase_evaluator_data { +public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) + { +#ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(outerStride); +#endif + eigen_internal_assert(outerStride==OuterStride); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; } + const Scalar *data; +}; + +template class plainobjectbase_evaluator_data { +public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return m_outerStride; } + const Scalar *data; +protected: + Index m_outerStride; +}; + template struct evaluator > : evaluator_base @@ -136,23 +172,28 @@ struct evaluator > IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime, RowsAtCompileTime = PlainObjectType::RowsAtCompileTime, ColsAtCompileTime = PlainObjectType::ColsAtCompileTime, - + CoeffReadCost = NumTraits::ReadCost, Flags = traits::EvaluatorFlags, Alignment = traits::Alignment }; - - EIGEN_DEVICE_FUNC evaluator() - : m_data(0), - m_outerStride(IsVectorAtCompileTime ? 0 - : int(IsRowMajor) ? ColsAtCompileTime - : RowsAtCompileTime) + enum { + // We do not need to know the outer stride for vectors + OuterStrideAtCompileTime = IsVectorAtCompileTime ? 0 + : int(IsRowMajor) ? ColsAtCompileTime + : RowsAtCompileTime + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() + : m_d(0,OuterStrideAtCompileTime) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - - EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m) - : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const PlainObjectType& m) + : m_d(m.data(),IsVectorAtCompileTime ? 
0 : m.outerStride()) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -161,30 +202,30 @@ struct evaluator > CoeffReturnType coeff(Index row, Index col) const { if (IsRowMajor) - return m_data[row * m_outerStride.value() + col]; + return m_d.data[row * m_d.outerStride() + col]; else - return m_data[row + col * m_outerStride.value()]; + return m_d.data[row + col * m_d.outerStride()]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_data[index]; + return m_d.data[index]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { if (IsRowMajor) - return const_cast(m_data)[row * m_outerStride.value() + col]; + return const_cast(m_d.data)[row * m_d.outerStride() + col]; else - return const_cast(m_data)[row + col * m_outerStride.value()]; + return const_cast(m_d.data)[row + col * m_d.outerStride()]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return const_cast(m_data)[index]; + return const_cast(m_d.data)[index]; } template @@ -192,16 +233,16 @@ struct evaluator > PacketType packet(Index row, Index col) const { if (IsRowMajor) - return ploadt(m_data + row * m_outerStride.value() + col); + return ploadt(m_d.data + row * m_d.outerStride() + col); else - return ploadt(m_data + row + col * m_outerStride.value()); + return ploadt(m_d.data + row + col * m_d.outerStride()); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return ploadt(m_data + index); + return ploadt(m_d.data + index); } template @@ -210,26 +251,22 @@ struct evaluator > { if (IsRowMajor) return pstoret - (const_cast(m_data) + row * m_outerStride.value() + col, x); + (const_cast(m_d.data) + row * m_d.outerStride() + col, x); else return pstoret - (const_cast(m_data) + row + col * m_outerStride.value(), x); + (const_cast(m_d.data) + row + col * m_d.outerStride(), x); } template EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return pstoret(const_cast(m_data) + 
index, x); + return pstoret(const_cast(m_d.data) + index, x); } protected: - const Scalar *m_data; - // We do not need to know the outer stride for vectors - variable_if_dynamic m_outerStride; + plainobjectbase_evaluator_data m_d; }; template @@ -237,11 +274,13 @@ struct evaluator > : evaluator > > { typedef Matrix XprType; - - EIGEN_DEVICE_FUNC evaluator() {} - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) - : evaluator >(m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& m) + : evaluator >(m) { } }; @@ -251,10 +290,12 @@ struct evaluator > { typedef Array XprType; - EIGEN_DEVICE_FUNC evaluator() {} - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) - : evaluator >(m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& m) + : evaluator >(m) { } }; @@ -265,14 +306,15 @@ struct unary_evaluator, IndexBased> : evaluator_base > { typedef Transpose XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost, + CoeffReadCost = evaluator::CoeffReadCost, Flags = evaluator::Flags ^ RowMajorBit, Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -457,10 +499,10 @@ struct evaluator > { typedef CwiseNullaryOp XprType; typedef typename internal::remove_all::type PlainObjectTypeCleaned; - + enum { CoeffReadCost = internal::functor_traits::Cost, - + Flags = (evaluator::Flags & ( HereditaryBits | (functor_has_linear_access::ret ? 
LinearAccessBit : 0) @@ -517,19 +559,17 @@ struct unary_evaluator, IndexBased > : evaluator_base > { typedef CwiseUnaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = evaluator::Flags & (HereditaryBits | LinearAccessBit | (functor_traits::PacketAccess ? PacketAccessBit : 0)), Alignment = evaluator::Alignment }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& op) - : m_functor(op.functor()), - m_argImpl(op.nestedExpression()) + explicit unary_evaluator(const XprType& op) : m_d(op) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -540,32 +580,43 @@ struct unary_evaluator, IndexBased > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_argImpl.coeff(row, col)); + return m_d.func()(m_d.argImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_argImpl.coeff(index)); + return m_d.func()(m_d.argImpl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_argImpl.template packet(row, col)); + return m_d.func().packetOp(m_d.argImpl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_argImpl.template packet(index)); + return m_d.func().packetOp(m_d.argImpl.template packet(index)); } protected: - const UnaryOp m_functor; - evaluator m_argImpl; + + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const UnaryOp& func() const { return op; } + UnaryOp op; + evaluator argImpl; + }; + + 
Data m_d; }; // -------------------- CwiseTernaryOp -------------------- @@ -577,7 +628,7 @@ struct evaluator > { typedef CwiseTernaryOp XprType; typedef ternary_evaluator > Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} }; @@ -586,10 +637,10 @@ struct ternary_evaluator, IndexBased : evaluator_base > { typedef CwiseTernaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Arg1Flags = evaluator::Flags, Arg2Flags = evaluator::Flags, Arg3Flags = evaluator::Flags, @@ -609,11 +660,7 @@ struct ternary_evaluator, IndexBased evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) - : m_functor(xpr.functor()), - m_arg1Impl(xpr.arg1()), - m_arg2Impl(xpr.arg2()), - m_arg3Impl(xpr.arg3()) + EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -624,38 +671,48 @@ struct ternary_evaluator, IndexBased EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col)); + return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_arg1Impl.template packet(row, col), - m_arg2Impl.template 
packet(row, col), - m_arg3Impl.template packet(row, col)); + return m_d.func().packetOp(m_d.arg1Impl.template packet(row, col), + m_d.arg2Impl.template packet(row, col), + m_d.arg3Impl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_arg1Impl.template packet(index), - m_arg2Impl.template packet(index), - m_arg3Impl.template packet(index)); + return m_d.func().packetOp(m_d.arg1Impl.template packet(index), + m_d.arg2Impl.template packet(index), + m_d.arg3Impl.template packet(index)); } protected: - const TernaryOp m_functor; - evaluator m_arg1Impl; - evaluator m_arg2Impl; - evaluator m_arg3Impl; + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TernaryOp& func() const { return op; } + TernaryOp op; + evaluator arg1Impl; + evaluator arg2Impl; + evaluator arg3Impl; + }; + + Data m_d; }; // -------------------- CwiseBinaryOp -------------------- @@ -667,8 +724,9 @@ struct evaluator > { typedef CwiseBinaryOp XprType; typedef binary_evaluator > Base; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& xpr) : Base(xpr) {} }; template @@ -676,10 +734,10 @@ struct binary_evaluator, IndexBased, IndexBase : evaluator_base > { typedef CwiseBinaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + LhsFlags = evaluator::Flags, RhsFlags = evaluator::Flags, SameType = is_same::value, @@ -696,10 +754,8 @@ struct binary_evaluator, IndexBased, IndexBase Alignment = 
EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment,evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr) - : m_functor(xpr.functor()), - m_lhsImpl(xpr.lhs()), - m_rhsImpl(xpr.rhs()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit binary_evaluator(const XprType& xpr) : m_d(xpr) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -710,35 +766,46 @@ struct binary_evaluator, IndexBased, IndexBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col)); + return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index)); + return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_lhsImpl.template packet(row, col), - m_rhsImpl.template packet(row, col)); + return m_d.func().packetOp(m_d.lhsImpl.template packet(row, col), + m_d.rhsImpl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_lhsImpl.template packet(index), - m_rhsImpl.template packet(index)); + return m_d.func().packetOp(m_d.lhsImpl.template packet(index), + m_d.rhsImpl.template packet(index)); } protected: - const BinaryOp m_functor; - evaluator m_lhsImpl; - evaluator m_rhsImpl; + + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const BinaryOp& func() const { return op; } + BinaryOp op; + evaluator lhsImpl; + evaluator rhsImpl; + }; + + Data m_d; }; // 
-------------------- CwiseUnaryView -------------------- @@ -748,18 +815,16 @@ struct unary_evaluator, IndexBased> : evaluator_base > { typedef CwiseUnaryView XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = (evaluator::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)), - + Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost... }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) - : m_unaryOp(op.functor()), - m_argImpl(op.nestedExpression()) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -771,30 +836,41 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_unaryOp(m_argImpl.coeff(row, col)); + return m_d.func()(m_d.argImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_unaryOp(m_argImpl.coeff(index)); + return m_d.func()(m_d.argImpl.coeff(index)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { - return m_unaryOp(m_argImpl.coeffRef(row, col)); + return m_d.func()(m_d.argImpl.coeffRef(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return m_unaryOp(m_argImpl.coeffRef(index)); + return m_d.func()(m_d.argImpl.coeffRef(index)); } protected: - const UnaryOp m_unaryOp; - evaluator m_argImpl; + + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const UnaryOp& func() const { return op; } + UnaryOp op; + evaluator argImpl; + }; + + Data m_d; 
}; // -------------------- Map -------------------- @@ -811,14 +887,15 @@ struct mapbase_evaluator : evaluator_base typedef typename XprType::PointerType PointerType; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - + enum { IsRowMajor = XprType::RowsAtCompileTime, ColsAtCompileTime = XprType::ColsAtCompileTime, CoeffReadCost = NumTraits::ReadCost }; - EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit mapbase_evaluator(const XprType& map) : m_data(const_cast(map.data())), m_innerStride(map.innerStride()), m_outerStride(map.outerStride()) @@ -882,17 +959,21 @@ struct mapbase_evaluator : evaluator_base internal::pstoret(m_data + index * m_innerStride.value(), x); } protected: - EIGEN_DEVICE_FUNC - inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } - EIGEN_DEVICE_FUNC - inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowStride() const EIGEN_NOEXCEPT { + return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colStride() const EIGEN_NOEXCEPT { + return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); + } PointerType m_data; const internal::variable_if_dynamic m_innerStride; const internal::variable_if_dynamic m_outerStride; }; -template +template struct evaluator > : public mapbase_evaluator, PlainObjectType> { @@ -900,7 +981,7 @@ struct evaluator > typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types typedef typename packet_traits::type PacketScalar; - + enum { InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? 
int(PlainObjectType::InnerStrideAtCompileTime) @@ -912,34 +993,35 @@ struct evaluator > HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0, HasNoStride = HasNoInnerStride && HasNoOuterStride, IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic, - + PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit), LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit), Flags = int( evaluator::Flags) & (LinearAccessMask&PacketAccessMask), - + Alignment = int(MapOptions)&int(AlignedMask) }; EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map) - : mapbase_evaluator(map) + : mapbase_evaluator(map) { } }; // -------------------- Ref -------------------- -template +template struct evaluator > : public mapbase_evaluator, PlainObjectType> { typedef Ref XprType; - + enum { Flags = evaluator >::Flags, Alignment = evaluator >::Alignment }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref) - : mapbase_evaluator(ref) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& ref) + : mapbase_evaluator(ref) { } }; @@ -947,8 +1029,8 @@ struct evaluator > template::ret> struct block_evaluator; - -template + +template struct evaluator > : block_evaluator { @@ -956,15 +1038,15 @@ struct evaluator > typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types typedef typename packet_traits::type PacketScalar; - + enum { CoeffReadCost = evaluator::CoeffReadCost, - + RowsAtCompileTime = traits::RowsAtCompileTime, ColsAtCompileTime = traits::ColsAtCompileTime, MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, MaxColsAtCompileTime = traits::MaxColsAtCompileTime, - + ArgTypeIsRowMajor = (int(evaluator::Flags)&RowMajorBit) != 0, IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1 : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 
0 @@ -978,14 +1060,14 @@ struct evaluator > ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0, - - FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, + + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, Flags0 = evaluator::Flags & ( (HereditaryBits & ~RowMajorBit) | DirectAccessBit | MaskPacketAccessBit), Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, - + PacketAlignment = unpacket_traits::alignment, Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (OuterStrideAtCompileTime!=0) @@ -993,7 +1075,8 @@ struct evaluator > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& block) : block_evaluator_type(block) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -1006,8 +1089,9 @@ struct block_evaluator XprType; - EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) - : unary_evaluator(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit block_evaluator(const XprType& block) + : unary_evaluator(block) {} }; @@ -1017,79 +1101,74 @@ struct unary_evaluator, IndexBa { typedef Block XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) - : m_argImpl(block.nestedExpression()), - m_startRow(block.startRow()), + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& block) + : m_argImpl(block.nestedExpression()), + m_startRow(block.startRow()), m_startCol(block.startCol()), - 
m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0) + m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0) { } - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { RowsAtCompileTime = XprType::RowsAtCompileTime, - ForwardLinearAccess = InnerPanel && bool(evaluator::Flags&LinearAccessBit) + ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator::Flags&LinearAccessBit) }; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const - { - return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); + { + return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - if (ForwardLinearAccess) - return m_argImpl.coeff(m_linear_offset.value() + index); - else - return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + { + return linear_coeff_impl(index, bool_constant()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) - { - return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); + { + return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - if (ForwardLinearAccess) - return m_argImpl.coeffRef(m_linear_offset.value() + index); - else - return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? 
index : 0); + { + return linear_coeffRef_impl(index, bool_constant()); } - + template EIGEN_STRONG_INLINE - PacketType packet(Index row, Index col) const - { - return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); + PacketType packet(Index row, Index col) const + { + return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); } template EIGEN_STRONG_INLINE - PacketType packet(Index index) const - { + PacketType packet(Index index) const + { if (ForwardLinearAccess) return m_argImpl.template packet(m_linear_offset.value() + index); else return packet(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - + template EIGEN_STRONG_INLINE - void writePacket(Index row, Index col, const PacketType& x) + void writePacket(Index row, Index col, const PacketType& x) { - return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); + return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); } - + template EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketType& x) + void writePacket(Index index, const PacketType& x) { if (ForwardLinearAccess) return m_argImpl.template writePacket(m_linear_offset.value() + index, x); @@ -1098,18 +1177,40 @@ struct unary_evaluator, IndexBa RowsAtCompileTime == 1 ? index : 0, x); } - + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const + { + return m_argImpl.coeff(m_linear_offset.value() + index); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const + { + return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? 
index : 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */) + { + return m_argImpl.coeffRef(m_linear_offset.value() + index); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */) + { + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + } + evaluator m_argImpl; const variable_if_dynamic m_startRow; const variable_if_dynamic m_startCol; - const variable_if_dynamic m_linear_offset; + const variable_if_dynamic m_linear_offset; }; -// TODO: This evaluator does not actually use the child evaluator; +// TODO: This evaluator does not actually use the child evaluator; // all action is via the data() as returned by the Block expression. -template +template struct block_evaluator : mapbase_evaluator, typename Block::PlainObject> @@ -1117,8 +1218,9 @@ struct block_evaluator XprType; typedef typename XprType::Scalar Scalar; - EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) - : mapbase_evaluator(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit block_evaluator(const XprType& block) + : mapbase_evaluator(block) { // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); @@ -1141,18 +1243,19 @@ struct evaluator > evaluator::CoeffReadCost), Flags = (unsigned int)evaluator::Flags & evaluator::Flags & HereditaryBits, - + Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& select) : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) { 
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - + typedef typename XprType::CoeffReturnType CoeffReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1172,7 +1275,7 @@ struct evaluator > else return m_elseImpl.coeff(index); } - + protected: evaluator m_conditionImpl; evaluator m_thenImpl; @@ -1182,7 +1285,7 @@ struct evaluator > // -------------------- Replicate -------------------- -template +template struct unary_evaluator > : evaluator_base > { @@ -1193,22 +1296,23 @@ struct unary_evaluator > }; typedef typename internal::nested_eval::type ArgTypeNested; typedef typename internal::remove_all::type ArgTypeNestedCleaned; - + enum { CoeffReadCost = evaluator::CoeffReadCost, LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0, Flags = (evaluator::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits::Flags & RowMajorBit), - + Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& replicate) : m_arg(replicate.nestedExpression()), m_argImpl(m_arg), m_rows(replicate.nestedExpression().rows()), m_cols(replicate.nestedExpression().cols()) {} - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { @@ -1219,10 +1323,10 @@ struct unary_evaluator > const Index actual_col = internal::traits::ColsAtCompileTime==1 ? 0 : ColFactor==1 ? col : col % m_cols.value(); - + return m_argImpl.coeff(actual_row, actual_col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -1230,7 +1334,7 @@ struct unary_evaluator > const Index actual_index = internal::traits::RowsAtCompileTime==1 ? (ColFactor==1 ? index : index%m_cols.value()) : (RowFactor==1 ? 
index : index%m_rows.value()); - + return m_argImpl.coeff(actual_index); } @@ -1247,7 +1351,7 @@ struct unary_evaluator > return m_argImpl.template packet(actual_row, actual_col); } - + template EIGEN_STRONG_INLINE PacketType packet(Index index) const @@ -1258,7 +1362,7 @@ struct unary_evaluator > return m_argImpl.template packet(actual_index); } - + protected: const ArgTypeNested m_arg; evaluator m_argImpl; @@ -1266,64 +1370,6 @@ struct unary_evaluator > const variable_if_dynamic m_cols; }; - -// -------------------- PartialReduxExpr -------------------- - -template< typename ArgType, typename MemberOp, int Direction> -struct evaluator > - : evaluator_base > -{ - typedef PartialReduxExpr XprType; - typedef typename internal::nested_eval::type ArgTypeNested; - typedef typename internal::remove_all::type ArgTypeNestedCleaned; - typedef typename ArgType::Scalar InputScalar; - typedef typename XprType::Scalar Scalar; - enum { - TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) - }; - typedef typename MemberOp::template Cost CostOpType; - enum { - CoeffReadCost = TraversalSize==Dynamic ? HugeCost - : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), - - Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, - - Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized - }; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) - : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) - { - EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? 
HugeCost : int(CostOpType::value)); - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index i, Index j) const - { - if (Direction==Vertical) - return m_functor(m_arg.col(j)); - else - return m_functor(m_arg.row(i)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index index) const - { - if (Direction==Vertical) - return m_functor(m_arg.col(index)); - else - return m_functor(m_arg.row(index)); - } - -protected: - typename internal::add_const_on_value_type::type m_arg; - const MemberOp m_functor; -}; - - // -------------------- MatrixWrapper and ArrayWrapper -------------------- // // evaluator_wrapper_base is a common base class for the @@ -1340,7 +1386,8 @@ struct evaluator_wrapper_base Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; @@ -1407,7 +1454,8 @@ struct unary_evaluator > { typedef MatrixWrapper XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1418,7 +1466,8 @@ struct unary_evaluator > { typedef ArrayWrapper XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1445,9 +1494,9 @@ struct unary_evaluator > ReversePacket = (Direction == BothDirections) || ((Direction == Vertical) && IsColMajor) || ((Direction == Horizontal) && IsRowMajor), - + CoeffReadCost = 
evaluator::CoeffReadCost, - + // let's enable LinearAccess only with vectorization because of the product overhead // FIXME enable DirectAccess with negative strides? Flags0 = evaluator::Flags, @@ -1456,16 +1505,17 @@ struct unary_evaluator > ? LinearAccessBit : 0, Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess), - + Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f. }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& reverse) : m_argImpl(reverse.nestedExpression()), m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1), m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { @@ -1540,7 +1590,7 @@ struct unary_evaluator > m_argImpl.template writePacket (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x)); } - + protected: evaluator m_argImpl; @@ -1558,20 +1608,21 @@ struct evaluator > : evaluator_base > { typedef Diagonal XprType; - + enum { CoeffReadCost = evaluator::CoeffReadCost, - + Flags = (unsigned int)(evaluator::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit, - + Alignment = 0 }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& diagonal) : m_argImpl(diagonal.nestedExpression()), m_index(diagonal.index()) { } - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -1604,8 +1655,10 @@ struct evaluator > const internal::variable_if_dynamicindex m_index; private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? 
m_index.value() : 0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; } }; @@ -1629,25 +1682,25 @@ class EvalToTemp : public dense_xpr_base >::type { public: - + typedef typename dense_xpr_base::type Base; EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp) - + explicit EvalToTemp(const ArgType& arg) : m_arg(arg) { } - + const ArgType& arg() const { return m_arg; } - Index rows() const + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_arg.rows(); } - Index cols() const + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_arg.cols(); } @@ -1655,7 +1708,7 @@ class EvalToTemp private: const ArgType& m_arg; }; - + template struct evaluator > : public evaluator @@ -1663,7 +1716,7 @@ struct evaluator > typedef EvalToTemp XprType; typedef typename ArgType::PlainObject PlainObject; typedef evaluator Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.arg()) { diff --git a/inst/include/Eigen/src/Core/CoreIterators.h b/inst/include/Eigen/src/Core/CoreIterators.h index 4eb42b93..b9671968 100644 --- a/inst/include/Eigen/src/Core/CoreIterators.h +++ b/inst/include/Eigen/src/Core/CoreIterators.h @@ -48,6 +48,11 @@ class InnerIterator * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView */ EIGEN_STRONG_INLINE InnerIterator& operator++() { m_iter.operator++(); return *this; } + EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; } + EIGEN_STRONG_INLINE InnerIterator operator+(Index i) + { InnerIterator result(*this); result+=i; return result; } + + /// \returns the column or row index of the current coefficient. EIGEN_STRONG_INLINE Index index() const { return m_iter.index(); } /// \returns the row index of the current coefficient. 
diff --git a/inst/include/Eigen/src/Core/CwiseBinaryOp.h b/inst/include/Eigen/src/Core/CwiseBinaryOp.h index a36765e3..2202b1cc 100644 --- a/inst/include/Eigen/src/Core/CwiseBinaryOp.h +++ b/inst/include/Eigen/src/Core/CwiseBinaryOp.h @@ -74,7 +74,7 @@ class CwiseBinaryOpImpl; * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp */ template -class CwiseBinaryOp : +class CwiseBinaryOp : public CwiseBinaryOpImpl< BinaryOp, LhsType, RhsType, typename internal::cwise_promote_storage_type::StorageKind, @@ -83,7 +83,7 @@ class CwiseBinaryOp : internal::no_assignment_operator { public: - + typedef typename internal::remove_all::type Functor; typedef typename internal::remove_all::type Lhs; typedef typename internal::remove_all::type Rhs; @@ -100,8 +100,14 @@ class CwiseBinaryOp : typedef typename internal::remove_reference::type _LhsNested; typedef typename internal::remove_reference::type _RhsNested; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) +#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 + //Required for Visual Studio or the Copy constructor will probably not get inlined! 
+ EIGEN_STRONG_INLINE + CwiseBinaryOp(const CwiseBinaryOp&) = default; +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) { EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar); @@ -110,31 +116,25 @@ class CwiseBinaryOp : eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { // return the fixed size type if available to enable compile time optimizations - if (internal::traits::type>::RowsAtCompileTime==Dynamic) - return m_rhs.rows(); - else - return m_lhs.rows(); + return internal::traits::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { // return the fixed size type if available to enable compile time optimizations - if (internal::traits::type>::ColsAtCompileTime==Dynamic) - return m_rhs.cols(); - else - return m_lhs.cols(); + return internal::traits::type>::ColsAtCompileTime==Dynamic ? 
m_rhs.cols() : m_lhs.cols(); } /** \returns the left hand side nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; } /** \returns the right hand side nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; } /** \returns the functor representing the binary operation */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& functor() const { return m_functor; } protected: @@ -158,7 +158,7 @@ class CwiseBinaryOpImpl */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator-=(const MatrixBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -171,7 +171,7 @@ MatrixBase::operator-=(const MatrixBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator+=(const MatrixBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -181,4 +181,3 @@ MatrixBase::operator+=(const MatrixBase& other) } // end namespace Eigen #endif // EIGEN_CWISE_BINARY_OP_H - diff --git a/inst/include/Eigen/src/Core/CwiseNullaryOp.h b/inst/include/Eigen/src/Core/CwiseNullaryOp.h index ddd607e3..289ec510 100644 --- a/inst/include/Eigen/src/Core/CwiseNullaryOp.h +++ b/inst/include/Eigen/src/Core/CwiseNullaryOp.h @@ -74,10 +74,10 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -126,12 +131,17 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f 
* * Here is an example with C++11 random generators: \include random_cpp11.cpp * Output: \verbinclude random_cpp11.out - * + * * \sa class CwiseNullaryOp */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -150,7 +160,12 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(const CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -170,7 +185,7 @@ DenseBase::NullaryExpr(const CustomNullaryOp& func) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index rows, Index cols, const Scalar& value) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); @@ -217,27 +232,32 @@ DenseBase::Constant(const Scalar& value) /** \deprecated because of accuracy loss. 
In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&) * - * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) + * \only_for_vectors + * + * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp + * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out + * + * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&) * - * \sa LinSpaced(Scalar,Scalar) + * \sa LinSpaced(const Scalar&, const Scalar&) */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** @@ -268,7 +288,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomA DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return 
DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** @@ -281,7 +301,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ @@ -363,6 +383,33 @@ PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) return setConstant(val); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to the given value \a val. For the parameter + * of type NoChange_t, just pass the special value \c NoChange. + * + * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setConstant(NoChange_t, Index cols, const Scalar& val) +{ + return setConstant(rows(), cols, val); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to the given value \a val. For the parameter + * of type NoChange_t, just pass the special value \c NoChange. + * + * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setConstant(Index rows, NoChange_t, const Scalar& val) +{ + return setConstant(rows, cols(), val); +} + + /** * \brief Sets a linearly spaced vector. 
* @@ -383,7 +430,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); + return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); } /** @@ -536,6 +583,32 @@ PlainObjectBase::setZero(Index rows, Index cols) return setConstant(Scalar(0)); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to zero. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Zero() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setZero(NoChange_t, Index cols) +{ + return setZero(rows(), cols); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to zero. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Zero() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setZero(Index rows, NoChange_t) +{ + return setZero(rows, cols()); +} + // ones: /** \returns an expression of a matrix where all coefficients equal one. @@ -662,6 +735,32 @@ PlainObjectBase::setOnes(Index rows, Index cols) return setConstant(Scalar(1)); } +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to one. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. 
+ * + * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp, MatrixBase::Ones() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setOnes(Index rows, NoChange_t) +{ + return setOnes(rows, cols()); +} + +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to one. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t) class CwiseNullaryOp, MatrixBase::Ones() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setOnes(NoChange_t, Index cols) +{ + return setOnes(rows(), cols); +} + // Identity: /** \returns an expression of the identity matrix (not necessarily square). @@ -861,6 +960,42 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } +/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector + * + * \param i index of the unique coefficient to be set to 1 + * + * \only_for_vectors + * + * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index newSize, Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i @@ -24,7 +24,7 @@ struct traits > typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; enum { - Flags = _XprTypeNested::Flags & RowMajorBit + Flags = _XprTypeNested::Flags & RowMajorBit }; }; } @@ -65,10 +65,10 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::non_const_type MatrixTypeNested; typedef typename 
internal::remove_all::type NestedExpression; - explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) + explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) : m_matrix(mat), m_functor(func) {} EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView) - EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); } - EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \returns the functor representing unary operation */ - const ViewOp& functor() const { return m_functor; } + EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; } /** \returns the nested expression */ - const typename internal::remove_all::type& + EIGEN_DEVICE_FUNC const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - typename internal::remove_reference::type& - nestedExpression() { return m_matrix.const_cast_derived(); } + EIGEN_DEVICE_FUNC typename internal::remove_reference::type& + nestedExpression() { return m_matrix; } protected: MatrixTypeNested m_matrix; @@ -108,16 +110,16 @@ class CwiseUnaryViewImpl EIGEN_DENSE_PUBLIC_INTERFACE(Derived) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl) - + EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); } EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); } - EIGEN_DEVICE_FUNC inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return derived().nestedExpression().innerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } - EIGEN_DEVICE_FUNC inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index 
outerStride() const { return derived().nestedExpression().outerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } diff --git a/inst/include/Eigen/src/Core/DenseBase.h b/inst/include/Eigen/src/Core/DenseBase.h index c55a6823..9b16db68 100644 --- a/inst/include/Eigen/src/Core/DenseBase.h +++ b/inst/include/Eigen/src/Core/DenseBase.h @@ -14,15 +14,15 @@ namespace Eigen { namespace internal { - + // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type. // This dummy function simply aims at checking that at compile time. static inline void check_DenseIndex_is_signed() { - EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); + EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE) } } // end namespace internal - + /** \class DenseBase * \ingroup Core_Module * @@ -64,12 +64,12 @@ template class DenseBase /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex, etc. */ typedef typename internal::traits::Scalar Scalar; - + /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex, etc. * * It is an alias for the Scalar type */ typedef Scalar value_type; - + typedef typename NumTraits::Real RealScalar; typedef DenseCoeffsBase::value> Base; @@ -150,13 +150,18 @@ template class DenseBase * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime */ - IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1, + IsVectorAtCompileTime = internal::traits::RowsAtCompileTime == 1 + || internal::traits::ColsAtCompileTime == 1, /**< This is set to true if either the number of rows or the number of * columns is known at compile-time to be equal to 1. Indeed, in that case, * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 
0 : bool(IsVectorAtCompileTime) ? 1 : 2, + /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, + * and 2 for matrices. + */ + Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". @@ -170,11 +175,11 @@ template class DenseBase InnerStrideAtCompileTime = internal::inner_stride_at_compile_time::ret, OuterStrideAtCompileTime = internal::outer_stride_at_compile_time::ret }; - + typedef typename internal::find_best_packet::type PacketScalar; enum { IsPlainObjectBase = 0 }; - + /** The plain matrix type corresponding to this expression. * \sa PlainObject */ typedef Matrix::Scalar, @@ -184,7 +189,7 @@ template class DenseBase internal::traits::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime > PlainMatrix; - + /** The plain array type corresponding to this expression. * \sa PlainObject */ typedef Array::Scalar, @@ -206,7 +211,7 @@ template class DenseBase /** \returns the number of nonzero coefficients which is in practice the number * of stored coefficients. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index nonZeros() const { return size(); } /** \returns the outer size. @@ -214,7 +219,7 @@ template class DenseBase * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a * column-major matrix, and the number of rows for a row-major matrix. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const { return IsVectorAtCompileTime ? 1 @@ -224,9 +229,9 @@ template class DenseBase /** \returns the inner size. * * \note For a vector, this is just the size. 
For a matrix (non-vector), this is the minor dimension - * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a + * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a * column-major matrix, and the number of columns for a row-major matrix. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const { return IsVectorAtCompileTime ? this->size() @@ -261,9 +266,9 @@ template class DenseBase /** \internal Represents a matrix with all coefficients equal to one another*/ typedef CwiseNullaryOp,PlainObject> ConstantReturnType; /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */ - typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; + EIGEN_DEPRECATED typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; /** \internal Represents a vector with linearly spaced coefficients that allows random access. */ - typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; /** \internal the return type of MatrixBase::eigenvalues() */ typedef Matrix::Scalar>::Real, internal::traits::ColsAtCompileTime, 1> EigenvaluesReturnType; @@ -297,17 +302,17 @@ template class DenseBase Derived& operator=(const ReturnByValue& func); /** \internal - * Copies \a other into *this without evaluating other. \returns a reference to *this. - * \deprecated */ + * Copies \a other into *this without evaluating other. \returns a reference to *this. 
*/ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC Derived& lazyAssign(const DenseBase& other); EIGEN_DEVICE_FUNC CommaInitializer operator<< (const Scalar& s); - /** \deprecated it now returns \c *this */ template + /** \deprecated it now returns \c *this */ EIGEN_DEPRECATED const Derived& flagged() const { return derived(); } @@ -332,12 +337,13 @@ template class DenseBase EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(const Scalar& value); - EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high); + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType + LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Index size, const Scalar& low, const Scalar& high); - EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType - LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(const Scalar& low, const Scalar& high); @@ -369,7 +375,7 @@ template class DenseBase template EIGEN_DEVICE_FUNC bool isApprox(const DenseBase& other, const RealScalar& prec = NumTraits::dummy_precision()) const; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC bool isMuchSmallerThan(const RealScalar& other, const RealScalar& prec = NumTraits::dummy_precision()) const; template EIGEN_DEVICE_FUNC @@ -380,7 +386,7 @@ template class DenseBase EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits::dummy_precision()) const; - + inline bool hasNaN() const; inline bool allFinite() const; @@ 
-394,8 +400,8 @@ template class DenseBase * * Notice that in the case of a plain matrix or vector (not an expression) this function just returns * a const reference, in order to avoid a useless copy. - * - * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. + * + * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvalReturnType eval() const @@ -405,12 +411,12 @@ template class DenseBase // size types on MSVC. return typename internal::eval::type(derived()); } - + /** swaps *this with the expression \a other. * */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(const DenseBase& other) { EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); @@ -422,7 +428,7 @@ template class DenseBase * */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(PlainObjectBase& other) { eigen_assert(rows()==other.rows() && cols()==other.cols()); @@ -443,18 +449,58 @@ template class DenseBase EIGEN_DEVICE_FUNC Scalar prod() const; + template EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff() const; + template EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff() const; - template EIGEN_DEVICE_FUNC + + // By default, the fastest version with undefined NaN propagation semantics is + // used. + // TODO(rmlarsen): Replace with default template argument when we move to + // c++11 or beyond. 
+ EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar minCoeff() const { + return minCoeff(); + } + EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar maxCoeff() const { + return maxCoeff(); + } + + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff(IndexType* row, IndexType* col) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff(IndexType* row, IndexType* col) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff(IndexType* index) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff(IndexType* index) const; + // TODO(rmlarsen): Replace these methods with a default template argument. + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar minCoeff(IndexType* row, IndexType* col) const { + return minCoeff(row, col); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar maxCoeff(IndexType* row, IndexType* col) const { + return maxCoeff(row, col); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar minCoeff(IndexType* index) const { + return minCoeff(index); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar maxCoeff(IndexType* index) const { + return maxCoeff(index); + } + template EIGEN_DEVICE_FUNC Scalar redux(const BinaryOp& func) const; @@ -493,7 +539,7 @@ template class DenseBase typedef VectorwiseOp ColwiseReturnType; typedef const VectorwiseOp ConstColwiseReturnType; - /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations + /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions * * Example: \include MatrixBase_rowwise.cpp * Output: \verbinclude MatrixBase_rowwise.out @@ -506,7 +552,7 @@ template class DenseBase } EIGEN_DEVICE_FUNC RowwiseReturnType rowwise(); - /** \returns a VectorwiseOp 
wrapper of *this providing additional partial reduction operations + /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions * * Example: \include MatrixBase_colwise.cpp * Output: \verbinclude MatrixBase_colwise.out @@ -524,16 +570,16 @@ template class DenseBase static const RandomReturnType Random(); template - const Select + inline EIGEN_DEVICE_FUNC const Select select(const DenseBase& thenMatrix, const DenseBase& elseMatrix) const; template - inline const Select + inline EIGEN_DEVICE_FUNC const Select select(const DenseBase& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const; template - inline const Select + inline EIGEN_DEVICE_FUNC const Select select(const typename ElseDerived::Scalar& thenScalar, const DenseBase& elseMatrix) const; template RealScalar lpNorm() const; @@ -567,16 +613,59 @@ template class DenseBase } EIGEN_DEVICE_FUNC void reverseInPlace(); + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** STL-like RandomAccessIterator + * iterator type as returned by the begin() and end() methods. + */ + typedef random_access_iterator_type iterator; + /** This is the const version of iterator (aka read-only) */ + typedef random_access_iterator_type const_iterator; + #else + typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, + internal::pointer_based_stl_iterator, + internal::generic_randaccess_stl_iterator + >::type iterator_type; + + typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, + internal::pointer_based_stl_iterator, + internal::generic_randaccess_stl_iterator + >::type const_iterator_type; + + // Stl-style iterators are supported only for vectors. 
+ + typedef typename internal::conditional< IsVectorAtCompileTime, + iterator_type, + void + >::type iterator; + + typedef typename internal::conditional< IsVectorAtCompileTime, + const_iterator_type, + void + >::type const_iterator; + #endif + + inline iterator begin(); + inline const_iterator begin() const; + inline const_iterator cbegin() const; + inline iterator end(); + inline const_iterator end() const; + inline const_iterator cend() const; + #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) +#define EIGEN_DOC_UNARY_ADDONS(X,Y) +# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/BlockMethods.h" +# include "../plugins/IndexedViewMethods.h" +# include "../plugins/ReshapedMethods.h" # ifdef EIGEN_DENSEBASE_PLUGIN # include EIGEN_DENSEBASE_PLUGIN # endif #undef EIGEN_CURRENT_STORAGE_BASE_CLASS #undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL #undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF +#undef EIGEN_DOC_UNARY_ADDONS // disable the use of evalTo for dense objects with a nice compilation error template diff --git a/inst/include/Eigen/src/Core/DenseCoeffsBase.h b/inst/include/Eigen/src/Core/DenseCoeffsBase.h index c4af48ab..37fcdb59 100644 --- a/inst/include/Eigen/src/Core/DenseCoeffsBase.h +++ b/inst/include/Eigen/src/Core/DenseCoeffsBase.h @@ -22,11 +22,12 @@ template struct add_const_on_value_type_if_arithmetic /** \brief Base class providing read-only coefficient access to matrices and arrays. * \ingroup Core_Module * \tparam Derived Type of the derived class - * \tparam #ReadOnlyAccessors Constant indicating read-only access + * + * \note #ReadOnlyAccessors Constant indicating read-only access * * This class defines the \c operator() \c const function and friends, which can be used to read specific * entries of a matrix or array. 
- * + * * \sa DenseCoeffsBase, DenseCoeffsBase, * \ref TopicClassHierarchy */ @@ -288,12 +289,13 @@ class DenseCoeffsBase : public EigenBase /** \brief Base class providing read/write coefficient access to matrices and arrays. * \ingroup Core_Module * \tparam Derived Type of the derived class - * \tparam #WriteAccessors Constant indicating read/write access + * + * \note #WriteAccessors Constant indicating read/write access * * This class defines the non-const \c operator() function and friends, which can be used to write specific * entries of a matrix or array. This class inherits DenseCoeffsBase which * defines the const variant for reading specific entries. - * + * * \sa DenseCoeffsBase, \ref TopicClassHierarchy */ template @@ -466,7 +468,8 @@ class DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read-only using @@ -492,7 +495,7 @@ class DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read/write using @@ -566,8 +570,8 @@ class DenseCoeffsBase * * \sa outerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); } @@ -577,14 +581,14 @@ class DenseCoeffsBase * * \sa innerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); } // FIXME shall we remove it ? - inline Index stride() const + EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT { return Derived::IsVectorAtCompileTime ? 
innerStride() : outerStride(); } @@ -593,8 +597,8 @@ class DenseCoeffsBase * * \sa innerStride(), outerStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index rowStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rowStride() const EIGEN_NOEXCEPT { return Derived::IsRowMajor ? outerStride() : innerStride(); } @@ -603,8 +607,8 @@ class DenseCoeffsBase * * \sa innerStride(), outerStride(), rowStride() */ - EIGEN_DEVICE_FUNC - inline Index colStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index colStride() const EIGEN_NOEXCEPT { return Derived::IsRowMajor ? innerStride() : outerStride(); } @@ -615,7 +619,7 @@ namespace internal { template struct first_aligned_impl { - static inline Index run(const Derived&) + static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT { return 0; } }; diff --git a/inst/include/Eigen/src/Core/DenseStorage.h b/inst/include/Eigen/src/Core/DenseStorage.h index 7d6d4e66..08ef6c53 100644 --- a/inst/include/Eigen/src/Core/DenseStorage.h +++ b/inst/include/Eigen/src/Core/DenseStorage.h @@ -47,21 +47,21 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array() - { + { check_static_allocation_size(); } EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) - { + { check_static_allocation_size(); } }; #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) -#elif EIGEN_GNUC_AT_LEAST(4,7) - // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned. +#elif EIGEN_GNUC_AT_LEAST(4,7) + // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned. 
// See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: template @@ -85,15 +85,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() + plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -104,15 +104,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() - { + plain_array() + { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -123,15 +123,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() + plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -142,15 +142,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() - { + plain_array() + { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -163,6 +163,30 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; +struct plain_array_helper { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void copy(const 
plain_array& src, const Eigen::Index size, + plain_array& dst) { + smart_copy(src.array, src.array + size, dst.array); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void swap(plain_array& a, const Eigen::Index a_size, + plain_array& b, const Eigen::Index b_size) { + if (a_size < b_size) { + std::swap_ranges(b.array, b.array + a_size, a.array); + smart_move(b.array + a_size, b.array + b_size, a.array + a_size); + } else if (a_size > b_size) { + std::swap_ranges(a.array, a.array + b_size, b.array); + smart_move(a.array + b_size, a.array + a_size, b.array + b_size); + } else { + std::swap_ranges(a.array, a.array + a_size, b.array); + } + } +}; + } // end namespace internal /** \internal @@ -190,16 +214,41 @@ template class DenseSt EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} - EIGEN_DEVICE_FUNC +#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } - EIGEN_DEVICE_FUNC +#else + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default; +#endif +#if !EIGEN_HAS_CXX11 + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) - { + { if (this != &other) m_data = other.m_data; - return *this; + return *this; + } +#else + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default; +#endif +#if EIGEN_HAS_RVALUE_REFERENCES +#if !EIGEN_HAS_CXX11 + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT + : m_data(std::move(other.m_data)) + { } + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT + { + if (this != &other) + m_data = std::move(other.m_data); + return *this; + } +#else + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default; + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default; +#endif 
+#endif EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); @@ -207,9 +256,11 @@ template class DenseSt EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); } - EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} - EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data, other.m_data); + } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {} EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } @@ -226,8 +277,8 @@ template class DenseStorage class DenseStorage class DenseStorage class DenseStorage class DenseStorage(m_data, m_rows*m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) - { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } - EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} - EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + numext::swap(m_cols,other.m_cols); + } + EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} + EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} void conservativeResize(Index size, Index rows, Index cols) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*m_cols); @@ -446,7 +524,7 @@ template class DenseStorageswap(tmp); } return *this; - } + } #if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT @@ -459,16 +537,18 
@@ template class DenseStorage(m_data, _Rows*m_cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } - EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} - EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_cols,other.m_cols); + } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} + EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, _Rows*m_cols); @@ -520,7 +600,7 @@ template class DenseStorageswap(tmp); } return *this; - } + } #if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT @@ -533,16 +613,18 @@ template class DenseStorage(m_data, _Cols*m_rows); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } - EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} - EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + } + EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return _Cols;} void conservativeResize(Index size, Index rows, Index) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*_Cols); diff --git a/inst/include/Eigen/src/Core/Diagonal.h b/inst/include/Eigen/src/Core/Diagonal.h index afcaf357..3112d2c1 100644 --- a/inst/include/Eigen/src/Core/Diagonal.h +++ b/inst/include/Eigen/src/Core/Diagonal.h @@ -11,7 +11,7 @@ #ifndef EIGEN_DIAGONAL_H #define EIGEN_DIAGONAL_H -namespace Eigen { +namespace Eigen { /** 
\class Diagonal * \ingroup Core_Module @@ -84,20 +84,16 @@ template class Diagonal : numext::mini(m_matrix.rows(),m_matrix.cols()-m_index.value()); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return 1; } - EIGEN_DEVICE_FUNC - inline Index innerStride() const - { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride() + 1; } - EIGEN_DEVICE_FUNC - inline Index outerStride() const - { - return 0; - } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return 0; } typedef typename internal::conditional< internal::is_lvalue::value, @@ -149,8 +145,8 @@ template class Diagonal } EIGEN_DEVICE_FUNC - inline const typename internal::remove_all::type& - nestedExpression() const + inline const typename internal::remove_all::type& + nestedExpression() const { return m_matrix; } @@ -167,12 +163,12 @@ template class Diagonal private: // some compilers may fail to optimize std::max etc in case of compile-time constants... - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index absDiagIndex() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 0 : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 
m_index.value() : 0; } // trigger a compile-time error if someone try to call packet template typename MatrixType::PacketReturnType packet(Index) const; template typename MatrixType::PacketReturnType packet(Index,Index) const; @@ -187,7 +183,7 @@ template class Diagonal * * \sa class Diagonal */ template -inline typename MatrixBase::DiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalReturnType MatrixBase::diagonal() { return DiagonalReturnType(derived()); @@ -195,7 +191,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template -inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -213,7 +209,7 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) { return DiagonalDynamicIndexReturnType(derived(), index); @@ -221,7 +217,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). */ template -inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); @@ -240,6 +236,7 @@ MatrixBase::diagonal(Index index) const * \sa MatrixBase::diagonal(), class Diagonal */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template DiagonalIndexReturnType::Type MatrixBase::diagonal() { @@ -249,6 +246,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). 
*/ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type MatrixBase::diagonal() const { diff --git a/inst/include/Eigen/src/Core/DiagonalMatrix.h b/inst/include/Eigen/src/Core/DiagonalMatrix.h index ecfdce8e..542685c6 100644 --- a/inst/include/Eigen/src/Core/DiagonalMatrix.h +++ b/inst/include/Eigen/src/Core/DiagonalMatrix.h @@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } - + EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } EIGEN_DEVICE_FUNC @@ -83,6 +83,30 @@ class DiagonalBase : public EigenBase { return DiagonalWrapper(scalar * other.diagonal()); } + + template + EIGEN_DEVICE_FUNC + #ifdef EIGEN_PARSED_BY_DOXYGEN + inline unspecified_expression_type + #else + inline const DiagonalWrapper + #endif + operator+(const DiagonalBase& other) const + { + return (diagonal() + other.diagonal()).asDiagonal(); + } + + template + EIGEN_DEVICE_FUNC + #ifdef EIGEN_PARSED_BY_DOXYGEN + inline unspecified_expression_type + #else + inline const DiagonalWrapper + #endif + operator-(const DiagonalBase& other) const + { + return (diagonal() - other.diagonal()).asDiagonal(); + } }; #endif @@ -154,6 +178,30 @@ class DiagonalMatrix EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {} + #if EIGEN_HAS_CXX11 + /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11 + * + * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients. + * + * \warning To construct a diagonal matrix of fixed size, the number of values passed to this + * constructor must match the fixed dimension of \c *this. 
+ * + * \sa DiagonalMatrix(const Scalar&, const Scalar&) + * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args) + : m_diagonal(a0, a1, a2, args...) {} + + /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list>& list) + : m_diagonal(list) {} + #endif // EIGEN_HAS_CXX11 + /** Copy constructor. */ template EIGEN_DEVICE_FUNC @@ -273,7 +321,7 @@ class DiagonalWrapper * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() **/ template -inline const DiagonalWrapper +EIGEN_DEVICE_FUNC inline const DiagonalWrapper MatrixBase::asDiagonal() const { return DiagonalWrapper(derived()); diff --git a/inst/include/Eigen/src/Core/DiagonalProduct.h b/inst/include/Eigen/src/Core/DiagonalProduct.h index d372b938..7911d1cd 100644 --- a/inst/include/Eigen/src/Core/DiagonalProduct.h +++ b/inst/include/Eigen/src/Core/DiagonalProduct.h @@ -17,7 +17,7 @@ namespace Eigen { */ template template -inline const Product +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const DiagonalBase &a_diagonal) const { return Product(derived(),a_diagonal.derived()); diff --git a/inst/include/Eigen/src/Core/Dot.h b/inst/include/Eigen/src/Core/Dot.h index 1fe7a84a..5c3441b9 100644 --- a/inst/include/Eigen/src/Core/Dot.h +++ b/inst/include/Eigen/src/Core/Dot.h @@ -86,14 +86,14 @@ MatrixBase::dot(const MatrixBase& other) const //---------- implementation of L2 norm and related functions ---------- -/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm. +/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm. 
* In both cases, it consists in the sum of the square of all the matrix entries. * For vectors, this is also equals to the dot product of \c *this with itself. * * \sa dot(), norm(), lpNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const { return numext::real((*this).cwiseAbs2().sum()); } @@ -105,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -120,7 +120,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa norm(), normalize() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -142,7 +142,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -EIGEN_STRONG_INLINE void MatrixBase::normalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -163,7 +163,7 @@ EIGEN_STRONG_INLINE void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -188,7 +188,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -EIGEN_STRONG_INLINE void 
MatrixBase::stableNormalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); @@ -207,7 +207,7 @@ struct lpNorm_selector EIGEN_DEVICE_FUNC static inline RealScalar run(const MatrixBase& m) { - EIGEN_USING_STD_MATH(pow) + EIGEN_USING_STD(pow) return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p); } }; @@ -260,9 +260,9 @@ struct lpNorm_selector template template #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NumTraits::Scalar>::Real +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real #else -MatrixBase::RealScalar +EIGEN_DEVICE_FUNC MatrixBase::RealScalar #endif MatrixBase::lpNorm() const { diff --git a/inst/include/Eigen/src/Core/EigenBase.h b/inst/include/Eigen/src/Core/EigenBase.h index b195506a..6b3c7d37 100644 --- a/inst/include/Eigen/src/Core/EigenBase.h +++ b/inst/include/Eigen/src/Core/EigenBase.h @@ -15,7 +15,7 @@ namespace Eigen { /** \class EigenBase * \ingroup Core_Module - * + * * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T). * * In other words, an EigenBase object is an object that can be copied into a MatrixBase. @@ -29,11 +29,12 @@ namespace Eigen { template struct EigenBase { // typedef typename internal::plain_matrix_type::type PlainObject; - + /** \brief The interface type of indices * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. - * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead. * \sa StorageIndex, \ref TopicPreprocessorDirectives. + * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead. + * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute. 
*/ typedef Eigen::Index Index; @@ -55,15 +56,15 @@ template struct EigenBase { return *static_cast(this); } /** \returns the number of rows. \sa cols(), RowsAtCompileTime */ - EIGEN_DEVICE_FUNC - inline Index rows() const { return derived().rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/ - EIGEN_DEVICE_FUNC - inline Index cols() const { return derived().cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); } /** \returns the number of coefficients, which is rows()*cols(). * \sa rows(), cols(), SizeAtCompileTime. */ - EIGEN_DEVICE_FUNC - inline Index size() const { return rows() * cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); } /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */ template diff --git a/inst/include/Eigen/src/Core/ForceAlignedAccess.h b/inst/include/Eigen/src/Core/ForceAlignedAccess.h index 7b08b45e..817a43af 100644 --- a/inst/include/Eigen/src/Core/ForceAlignedAccess.h +++ b/inst/include/Eigen/src/Core/ForceAlignedAccess.h @@ -41,10 +41,14 @@ template class ForceAlignedAccess EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC 
EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const { diff --git a/inst/include/Eigen/src/Core/Fuzzy.h b/inst/include/Eigen/src/Core/Fuzzy.h index 3e403a09..43aa49b2 100644 --- a/inst/include/Eigen/src/Core/Fuzzy.h +++ b/inst/include/Eigen/src/Core/Fuzzy.h @@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector */ template template -bool DenseBase::isApprox( +EIGEN_DEVICE_FUNC bool DenseBase::isApprox( const DenseBase& other, const RealScalar& prec ) const @@ -122,7 +122,7 @@ bool DenseBase::isApprox( * \sa isApprox(), isMuchSmallerThan(const DenseBase&, RealScalar) const */ template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const typename NumTraits::Real& other, const RealScalar& prec ) const @@ -142,7 +142,7 @@ bool DenseBase::isMuchSmallerThan( */ template template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const DenseBase& other, const RealScalar& prec ) const diff --git a/inst/include/Eigen/src/Core/GeneralProduct.h b/inst/include/Eigen/src/Core/GeneralProduct.h index 6f0cc80e..6906aa75 100644 --- a/inst/include/Eigen/src/Core/GeneralProduct.h +++ b/inst/include/Eigen/src/Core/GeneralProduct.h @@ -18,6 +18,16 @@ enum { Small = 3 }; +// Define the threshold value to fallback from the generic matrix-matrix product +// implementation (heavy) to the lightweight coeff-based product one. +// See generic_product_impl +// in products/GeneralMatrixMatrix.h for more details. +// TODO This threshold should also be used in the compile-time selector below. +#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD +// This default value has been obtained on a Haswell architecture. 
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20 +#endif + namespace internal { template struct product_type_selector; @@ -25,7 +35,7 @@ template struct product_type_selector; template struct product_size_category { enum { - #ifndef EIGEN_CUDA_ARCH + #ifndef EIGEN_GPU_COMPILE_PHASE is_large = MaxSize == Dynamic || Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), @@ -153,13 +163,13 @@ template struct gemv_static_vect template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } }; template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; } }; template @@ -218,8 +228,7 @@ template<> struct gemv_dense_selector ActualLhsType actualLhs = LhsBlasTraits::extract(lhs); ActualRhsType actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs); // make sure Dest is a compile-time vector type (bug 1166) typedef typename conditional::type ActualDest; @@ -229,7 +238,7 @@ template<> struct gemv_dense_selector // on, the other hand it is good for the cache to pack the vector anyways... 
EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1), ComplexByReal = (NumTraits::IsComplex) && (!NumTraits::IsComplex), - MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal + MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0) }; typedef const_blas_data_mapper LhsMapper; @@ -310,13 +319,12 @@ template<> struct gemv_dense_selector typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs); enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 // on, the other hand it is good for the cache to pack the vector anyways... - DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 + DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0 }; gemv_static_vector_if static_rhs; @@ -386,7 +394,8 @@ template<> struct gemv_dense_selector */ template template -inline const Product +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const Product MatrixBase::operator*(const MatrixBase &other) const { // A note regarding the function declaration: In MSVC, this function will sometimes @@ -428,6 +437,7 @@ MatrixBase::operator*(const MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Product MatrixBase::lazyProduct(const MatrixBase &other) const { diff --git a/inst/include/Eigen/src/Core/GenericPacketMath.h b/inst/include/Eigen/src/Core/GenericPacketMath.h index e5944377..cf677a19 100644 --- a/inst/include/Eigen/src/Core/GenericPacketMath.h +++ b/inst/include/Eigen/src/Core/GenericPacketMath.h @@ -44,23 +44,29 @@ struct default_packet_traits enum { HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - 
HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, HasSetLinear = 1, - HasBlend = 0, + HasBlend = 0, + // This flag is used to indicate whether packet comparison is supported. + // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true. + HasCmp = 0, HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, + HasExpm1 = 0, HasLog = 0, HasLog1p = 0, HasLog10 = 0, @@ -81,14 +87,18 @@ struct default_packet_traits HasPolygamma = 0, HasErf = 0, HasErfc = 0, + HasNdtri = 0, + HasBessel = 0, HasIGamma = 0, + HasIGammaDerA = 0, + HasGammaSampleDerAlpha = 0, HasIGammac = 0, HasBetaInc = 0, HasRound = 0, + HasRint = 0, HasFloor = 0, HasCeil = 0, - HasSign = 0 }; }; @@ -119,6 +129,22 @@ template struct packet_traits : default_packet_traits template struct packet_traits : packet_traits { }; +template struct unpacket_traits +{ + typedef T type; + typedef T half; + enum + { + size = 1, + alignment = 1, + vectorizable = false, + masked_load_available=false, + masked_store_available=false + }; +}; + +template struct unpacket_traits : unpacket_traits { }; + template struct type_casting_traits { enum { VectorizedCast = 0, @@ -127,6 +153,34 @@ template struct type_casting_traits { }; }; +/** \internal Wrapper to ensure that multiple packet types can map to the same + same underlying vector type. 
*/ +template +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; + + +/** \internal A convenience utility for determining if the type is a scalar. + * This is used to enable some generic packet implementations. + */ +template +struct is_scalar { + typedef typename unpacket_traits::type Scalar; + enum { + value = internal::is_same::value + }; +}; /** \internal \returns static_cast(a) (coeff-wise) */ template @@ -139,75 +193,406 @@ EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& /*b*/) { return static_cast(a); } - template EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) { return static_cast(a); } +template +EIGEN_DEVICE_FUNC inline TgtPacket +pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/, + const SrcPacket& /*e*/, const SrcPacket& /*f*/, const SrcPacket& /*g*/, const SrcPacket& /*h*/) { + return static_cast(a); +} + +/** \internal \returns reinterpret_cast(a) */ +template +EIGEN_DEVICE_FUNC inline Target +preinterpret(const Packet& a); /* { return reinterpret_cast(a); } */ /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -padd(const Packet& a, - const Packet& b) { return a+b; } +padd(const Packet& a, const Packet& b) { return a+b; } +// Avoid compiler warning for boolean algebra. 
+template<> EIGEN_DEVICE_FUNC inline bool +padd(const bool& a, const bool& b) { return a || b; } /** \internal \returns a - b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -psub(const Packet& a, - const Packet& b) { return a-b; } +psub(const Packet& a, const Packet& b) { return a-b; } /** \internal \returns -a (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pnegate(const Packet& a) { return -a; } -/** \internal \returns conj(a) (coeff-wise) */ +template<> EIGEN_DEVICE_FUNC inline bool +pnegate(const bool& a) { return !a; } +/** \internal \returns conj(a) (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pconj(const Packet& a) { return numext::conj(a); } /** \internal \returns a * b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmul(const Packet& a, - const Packet& b) { return a*b; } +pmul(const Packet& a, const Packet& b) { return a*b; } +// Avoid compiler warning for boolean algebra. +template<> EIGEN_DEVICE_FUNC inline bool +pmul(const bool& a, const bool& b) { return a && b; } /** \internal \returns a / b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pdiv(const Packet& a, - const Packet& b) { return a/b; } +pdiv(const Packet& a, const Packet& b) { return a/b; } + +// In the generic case, memset to all one bits. +template +struct ptrue_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){ + Packet b; + memset(static_cast(&b), 0xff, sizeof(Packet)); + return b; + } +}; -/** \internal \returns the min of \a a and \a b (coeff-wise) */ +// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value). +// Although this is technically not a valid bitmask, the scalar path for pselect +// uses a comparison to zero, so this should still work in most cases. We don't +// have another option, since the scalar type requires initialization. 
+template +struct ptrue_impl::value && NumTraits::RequireInitialization>::type > { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){ + return T(1); + } +}; + +/** \internal \returns one bits. */ template EIGEN_DEVICE_FUNC inline Packet -pmin(const Packet& a, - const Packet& b) { return numext::mini(a, b); } +ptrue(const Packet& a) { + return ptrue_impl::run(a); +} + +// In the general case, memset to zero. +template +struct pzero_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) { + Packet b; + memset(static_cast(&b), 0x00, sizeof(Packet)); + return b; + } +}; + +// For scalars, explicitly set to Scalar(0), since the underlying representation +// for zero may not consist of all-zero bits. +template +struct pzero_impl::value>::type> { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { + return T(0); + } +}; -/** \internal \returns the max of \a a and \a b (coeff-wise) */ +/** \internal \returns packet of zeros */ template EIGEN_DEVICE_FUNC inline Packet -pmax(const Packet& a, - const Packet& b) { return numext::maxi(a, b); } +pzero(const Packet& a) { + return pzero_impl::run(a); +} -/** \internal \returns the absolute value of \a a */ +/** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pabs(const Packet& a) { using std::abs; return abs(a); } +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); } -/** \internal \returns the phase angle of \a a */ +/** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -parg(const Packet& a) { using numext::arg; return arg(a); } +pcmp_lt(const Packet& a, const Packet& b) { return a EIGEN_DEVICE_FUNC inline Packet +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } + +/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? 
pzero(a) : ptrue(a); } + +template +struct bit_and { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a & b; + } +}; + +template +struct bit_or { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a | b; + } +}; + +template +struct bit_xor { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a ^ b; + } +}; + +template +struct bit_not { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { + return ~a; + } +}; + +// Use operators &, |, ^, ~. +template +struct operator_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not()(a); } +}; + +// Apply binary operations byte-by-byte +template +struct bytewise_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { + return binary(a, b, bit_and()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { + return binary(a, b, bit_or()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { + return binary(a, b, bit_xor()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { + return unary(a,bit_not()); + } + + private: + template + EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++); + } + return c; + } + + template + EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) { + const unsigned 
char* a_ptr = reinterpret_cast(&a); + const unsigned char* b_ptr = reinterpret_cast(&b); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++, *b_ptr++); + } + return c; + } +}; + +// In the general case, use byte-by-byte manipulation. +template +struct bitwise_helper : public bytewise_bitwise_helper {}; + +// For integers or non-trivial scalars, use binary operators. +template +struct bitwise_helper::value && (NumTraits::IsInteger || NumTraits::RequireInitialization)>::type + > : public operator_bitwise_helper {}; /** \internal \returns the bitwise and of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet -pand(const Packet& a, const Packet& b) { return a & b; } +pand(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_and(a, b); +} /** \internal \returns the bitwise or of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet -por(const Packet& a, const Packet& b) { return a | b; } +por(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_or(a, b); +} /** \internal \returns the bitwise xor of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet -pxor(const Packet& a, const Packet& b) { return a ^ b; } +pxor(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_xor(a, b); +} + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { + return bitwise_helper::bitwise_not(a); +} + +/** \internal \returns the bitwise and of \a a and not \a b */ +template EIGEN_DEVICE_FUNC inline Packet +pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); } + +// In the general case, use bitwise select. +template +struct pselect_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); + } +}; + +// For scalars, use ternary select. 
+template +struct pselect_impl::value>::type > { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return numext::equal_strict(mask, Packet(0)) ? b : a; + } +}; + +/** \internal \returns \a or \b for each field in packet according to \mask */ +template EIGEN_DEVICE_FUNC inline Packet +pselect(const Packet& mask, const Packet& a, const Packet& b) { + return pselect_impl::run(mask, a, b); +} + +template<> EIGEN_DEVICE_FUNC inline bool pselect( + const bool& cond, const bool& a, const bool& b) { + return cond ? a : b; +} + +/** \internal \returns the min or of \a a and \a b (coeff-wise) + If either \a a or \a b are NaN, the result is implementation defined. */ +template +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + return op(a,b); + } +}; + +/** \internal \returns the min or max of \a a and \a b (coeff-wise) + If either \a a or \a b are NaN, NaN is returned. */ +template<> +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet not_nan_mask_b = pcmp_eq(b, b); + return pselect(not_nan_mask_a, + pselect(not_nan_mask_b, op(a, b), b), + a); + } +}; + +/** \internal \returns the min or max of \a a and \a b (coeff-wise) + If both \a a and \a b are NaN, NaN is returned. + Equivalent to std::fmin(a, b). 
*/ +template<> +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet not_nan_mask_b = pcmp_eq(b, b); + return pselect(not_nan_mask_a, + pselect(not_nan_mask_b, op(a, b), a), + b); + } +}; + + +#ifndef SYCL_DEVICE_ONLY +#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func +#else +#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) \ +[](const Type& a, const Type& b) { \ + return Func(a, b);} +#endif + +/** \internal \returns the min of \a a and \a b (coeff-wise). + If \a a or \b b is NaN, the return value is implementation defined. */ +template EIGEN_DEVICE_FUNC inline Packet +pmin(const Packet& a, const Packet& b) { return numext::mini(a,b); } + +/** \internal \returns the min of \a a and \a b (coeff-wise). + NaNPropagation determines the NaN propagation semantics. */ +template +EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) { + return pminmax_impl::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin))); +} + +/** \internal \returns the max of \a a and \a b (coeff-wise) + If \a a or \b b is NaN, the return value is implementation defined. */ +template EIGEN_DEVICE_FUNC inline Packet +pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); } + +/** \internal \returns the max of \a a and \a b (coeff-wise). + NaNPropagation determines the NaN propagation semantics. 
*/ +template +EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) { + return pminmax_impl::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet,(pmax))); +} + +/** \internal \returns the absolute value of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pabs(const Packet& a) { return numext::abs(a); } +template<> EIGEN_DEVICE_FUNC inline unsigned int +pabs(const unsigned int& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline unsigned long +pabs(const unsigned long& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline unsigned long long +pabs(const unsigned long long& a) { return a; } + +/** \internal \returns the addsub value of \a a,b */ +template EIGEN_DEVICE_FUNC inline Packet +paddsub(const Packet& a, const Packet& b) { + return pselect(peven_mask(a), padd(a, b), psub(a, b)); + } + +/** \internal \returns the phase angle of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +parg(const Packet& a) { using numext::arg; return arg(a); } + + +/** \internal \returns \a a logically shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +parithmetic_shift_right(const int& a) { return a >> N; } +template EIGEN_DEVICE_FUNC inline long int +parithmetic_shift_right(const long int& a) { return a >> N; } + +/** \internal \returns \a a arithmetically shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +plogical_shift_right(const int& a) { return static_cast(static_cast(a) >> N); } +template EIGEN_DEVICE_FUNC inline long int +plogical_shift_right(const long int& a) { return static_cast(static_cast(a) >> N); } -/** \internal \returns the bitwise andnot of \a a and \a b */ +/** \internal \returns \a a shifted by N bits to the left */ +template EIGEN_DEVICE_FUNC inline int +plogical_shift_left(const int& a) { return a << N; } +template EIGEN_DEVICE_FUNC inline long int +plogical_shift_left(const long int& a) { return a << N; } + +/** \internal \returns the significant and exponent of the underlying floating point 
numbers + * See https://en.cppreference.com/w/cpp/numeric/math/frexp + */ +template +EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) { + int exp; + EIGEN_USING_STD(frexp); + Packet result = static_cast(frexp(a, &exp)); + exponent = static_cast(exp); + return result; +} + +/** \internal \returns a * 2^((int)exponent) + * See https://en.cppreference.com/w/cpp/numeric/math/ldexp + */ +template EIGEN_DEVICE_FUNC inline Packet +pldexp(const Packet &a, const Packet &exponent) { + EIGEN_USING_STD(ldexp) + return static_cast(ldexp(a, static_cast(exponent))); +} + +/** \internal \returns the min of \a a and \a b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (!b); } +pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -217,10 +602,22 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet version of \a *from, (un-aligned masked load) + * There is no generic implementation. We only have implementations for specialized + * cases. Generic case should not be called. 
+ */ +template EIGEN_DEVICE_FUNC inline +typename enable_if::masked_load_available, Packet>::type +ploadu(const typename unpacket_traits::type* from, typename unpacket_traits::mask_t umask); + /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients set from bits */ +template EIGEN_DEVICE_FUNC inline Packet +pset1frombits(BitsType a); + /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } @@ -237,7 +634,7 @@ ploaddup(const typename unpacket_traits::type* from) { return *from; } * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]} * Currently, this function is only used in matrix products. - * For packet-size smaller or equal to 4, this function is equivalent to pload1 + * For packet-size smaller or equal to 4, this function is equivalent to pload1 */ template EIGEN_DEVICE_FUNC inline Packet ploadquad(const typename unpacket_traits::type* from) @@ -281,6 +678,20 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0), + where x is the value of all 1-bits. */ +template EIGEN_DEVICE_FUNC inline Packet +peven_mask(const Packet& /*a*/) { + typedef typename unpacket_traits::type Scalar; + const size_t n = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n]; + for(size_t i = 0; i < n; ++i) { + memset(elements+i, ((i & 1) == 0 ? 
0xff : 0), sizeof(Scalar)); + } + return ploadu(elements); +} + + /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from) { (*to) = from; } @@ -289,6 +700,15 @@ template EIGEN_DEVICE_FUNC inline void pstore( template EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) { (*to) = from; } +/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask) + * There is no generic implementation. We only have implementations for specialized + * cases. Generic case should not be called. + */ +template +EIGEN_DEVICE_FUNC inline +typename enable_if::masked_store_available, void>::type +pstoreu(Scalar* to, const Packet& from, typename unpacket_traits::mask_t umask); + template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) { return ploadu(from); } @@ -298,8 +718,10 @@ template EIGEN_DEVICE_FUNC inline void pstoreu /** \internal tries to do cache prefetching of \a addr */ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef __CUDA_ARCH__ -#if defined(__LP64__) +#if defined(EIGEN_HIP_DEVICE_COMPILE) + // do nothing +#elif defined(EIGEN_CUDA_ARCH) +#if defined(__LP64__) || EIGEN_OS_WIN64 // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); #else @@ -311,39 +733,6 @@ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a #endif } -/** \internal \returns the first element of a packet */ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type pfirst(const Packet& a) -{ return a; } - -/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */ -template EIGEN_DEVICE_FUNC inline Packet -preduxp(const Packet* vecs) { return vecs[0]; } - -/** \internal \returns the sum of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) -{ 
return a; } - -/** \internal \returns the sum of the elements of \a a by block of 4 elements. - * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} - * For packet-size smaller or equal to 4, this boils down to a noop. - */ -template EIGEN_DEVICE_FUNC inline -typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type -predux_downto4(const Packet& a) -{ return a; } - -/** \internal \returns the product of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) -{ return a; } - -/** \internal \returns the min of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min(const Packet& a) -{ return a; } - -/** \internal \returns the max of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max(const Packet& a) -{ return a; } - /** \internal \returns the reversed elements of \a a*/ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } @@ -351,7 +740,7 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) { - return Packet(a.imag(),a.real()); + return Packet(numext::imag(a),numext::real(a)); } /************************** @@ -360,47 +749,51 @@ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet /** \internal \returns the sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psin(const Packet& a) { using std::sin; return sin(a); } +Packet psin(const Packet& a) { EIGEN_USING_STD(sin); return sin(a); } /** \internal \returns the cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcos(const Packet& a) { using std::cos; return cos(a); } +Packet 
pcos(const Packet& a) { EIGEN_USING_STD(cos); return cos(a); } /** \internal \returns the tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptan(const Packet& a) { using std::tan; return tan(a); } +Packet ptan(const Packet& a) { EIGEN_USING_STD(tan); return tan(a); } /** \internal \returns the arc sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pasin(const Packet& a) { using std::asin; return asin(a); } +Packet pasin(const Packet& a) { EIGEN_USING_STD(asin); return asin(a); } /** \internal \returns the arc cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pacos(const Packet& a) { using std::acos; return acos(a); } +Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); } /** \internal \returns the arc tangent of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet patan(const Packet& a) { using std::atan; return atan(a); } +Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); } /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psinh(const Packet& a) { using std::sinh; return sinh(a); } +Packet psinh(const Packet& a) { EIGEN_USING_STD(sinh); return sinh(a); } /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); } +Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); } /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); } +Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); } /** \internal \returns the exp of \a a (coeff-wise) */ template 
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pexp(const Packet& a) { using std::exp; return exp(a); } +Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); } + +/** \internal \returns the expm1 of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pexpm1(const Packet& a) { return numext::expm1(a); } /** \internal \returns the log of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog(const Packet& a) { using std::log; return log(a); } +Packet plog(const Packet& a) { EIGEN_USING_STD(log); return log(a); } /** \internal \returns the log1p of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -408,16 +801,24 @@ Packet plog1p(const Packet& a) { return numext::log1p(a); } /** \internal \returns the log10 of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog10(const Packet& a) { using std::log10; return log10(a); } +Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); } + +/** \internal \returns the log2 of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet plog2(const Packet& a) { + typedef typename internal::unpacket_traits::type Scalar; + return pmul(pset1(Scalar(EIGEN_LOG2E)), plog(a)); +} /** \internal \returns the square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); } +Packet psqrt(const Packet& a) { return numext::sqrt(a); } /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet prsqrt(const Packet& a) { - return pdiv(pset1(1), psqrt(a)); + typedef typename internal::unpacket_traits::type Scalar; + return pdiv(pset1(Scalar(1)), psqrt(a)); } /** \internal \returns the rounded value of \a a (coeff-wise) */ @@ -428,15
+829,121 @@ Packet pround(const Packet& a) { using numext::round; return round(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pfloor(const Packet& a) { using numext::floor; return floor(a); } +/** \internal \returns the rounded value of \a a (coeff-wise) with current + * rounding mode */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet print(const Packet& a) { using numext::rint; return rint(a); } + /** \internal \returns the ceil of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } +/** \internal \returns the first element of a packet */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +pfirst(const Packet& a) +{ return a; } + +/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4. + * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} + * For packet-size smaller or equal to 4, this boils down to a noop. + */ +template +EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type +predux_half_dowto4(const Packet& a) +{ return a; } + +// Slow generic implementation of Packet reduction. 
+template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_helper(const Packet& a, Op op) { + typedef typename unpacket_traits::type Scalar; + const size_t n = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n]; + pstoreu(elements, a); + for(size_t k = n / 2; k > 0; k /= 2) { + for(size_t i = 0; i < k; ++i) { + elements[i] = op(elements[i], elements[i + k]); + } + } + return elements[0]; +} + +/** \internal \returns the sum of the elements of \a a*/ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux(const Packet& a) +{ + return a; +} + +/** \internal \returns the product of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul))); +} + +/** \internal \returns the min of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( + const Packet &a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); +} + +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); +} + +/** \internal \returns the max of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( + const Packet &a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); +} + +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); +} + +#undef EIGEN_BINARY_OP_NAN_PROPAGATION + +/**
\internal \returns true if all coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +// not needed yet +// template EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) +// { return bool(a); } + +/** \internal \returns true if any coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) +{ + // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. + // It is expected that "true" is either: + // - Scalar(1) + // - bits full of ones (NaN for floats), + // - or first bit equals to 1 (1 for ints, smallest denormal for floats). + // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. + typedef typename unpacket_traits::type Scalar; + return numext::not_equal_strict(predux(a), Scalar(0)); +} + /*************************************************************************** * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ -/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ +/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. 
\a to must be 16 bytes aligned */ // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type) template inline void pstore1(typename unpacket_traits::type* to, const typename unpacket_traits::type& a) @@ -484,41 +991,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_t return ploadt(from); } -/** \internal default implementation of palign() allowing partial specialization */ -template -struct palign_impl -{ - // by default data are aligned, so there is nothing to be done :) - static inline void run(PacketType&, const PacketType&) {} -}; - -/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements - * of \a first and \a Offset first elements of \a second. - * - * This function is currently only used to optimize matrix-vector products on unligned matrices. - * It takes 2 packets that represent a contiguous memory array, and returns a packet starting - * at the position \a Offset. For instance, for packets of 4 elements, we have: - * Input: - * - first = {f0,f1,f2,f3} - * - second = {s0,s1,s2,s3} - * Output: - * - if Offset==0 then {f0,f1,f2,f3} - * - if Offset==1 then {f1,f2,f3,s0} - * - if Offset==2 then {f2,f3,s0,s1} - * - if Offset==3 then {f3,s0,s1,s3} - */ -template -inline void palign(PacketType& first, const PacketType& second) -{ - palign_impl::run(first,second); -} - /*************************************************************************** * Fast complex products (GCC generates a function call which is very slow) ***************************************************************************/ // Eigen+CUDA does not support complexes. 
-#ifndef __CUDACC__ +#if !defined(EIGEN_GPUCC) template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } @@ -555,34 +1033,6 @@ pblend(const Selector::size>& ifPacket, const Packet& th return ifPacket.select[0] ? thenPacket : elsePacket; } -/** \internal \returns \a a with the first coefficient replaced by the scalar b */ -template EIGEN_DEVICE_FUNC inline Packet -pinsertfirst(const Packet& a, typename unpacket_traits::type b) -{ - // Default implementation based on pblend. - // It must be specialized for higher performance. - Selector::size> mask; - mask.select[0] = true; - // This for loop should be optimized away by the compiler. - for(Index i=1; i::size; ++i) - mask.select[i] = false; - return pblend(mask, pset1(b), a); -} - -/** \internal \returns \a a with the last coefficient replaced by the scalar b */ -template EIGEN_DEVICE_FUNC inline Packet -pinsertlast(const Packet& a, typename unpacket_traits::type b) -{ - // Default implementation based on pblend. - // It must be specialized for higher performance. - Selector::size> mask; - // This for loop should be optimized away by the compiler. 
- for(Index i=0; i::size-1; ++i) - mask.select[i] = false; - mask.select[unpacket_traits::size-1] = true; - return pblend(mask, pset1(b), a); -} - } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/GlobalFunctions.h b/inst/include/Eigen/src/Core/GlobalFunctions.h index 769dc255..629af94b 100644 --- a/inst/include/Eigen/src/Core/GlobalFunctions.h +++ b/inst/include/Eigen/src/Core/GlobalFunctions.h @@ -66,21 +66,31 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) +#if EIGEN_HAS_CXX11_MATH + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh) +#endif + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa 
Eigen::log10 DOXCOMMA ArrayBase::log) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2,scalar_log2_op,base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil) @@ -88,7 +98,7 @@ 
namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign) - + /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent. * * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar). @@ -102,17 +112,18 @@ namespace Eigen inline const CwiseBinaryOp,Derived,Constant > pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent), - const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type - pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) { - return x.derived().pow(exponent); - } - - template - inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow) - pow(const Eigen::ArrayBase& x, const typename Derived::Scalar& exponent) { - return x.derived().pow(exponent); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,pow)) + pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) + { + typedef typename internal::promote_scalar_arg::type PromotedExponent; + return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(), + typename internal::plain_constant_type::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op(exponent))); } #endif @@ -122,21 +133,21 @@ namespace Eigen * * Example: \include 
Cwise_array_power_array.cpp * Output: \verbinclude Cwise_array_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase */ template inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> - pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) + pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) { return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( x.derived(), exponents.derived() ); } - + /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents. * * This function computes the coefficient-wise power between a scalar and an array of exponents. @@ -145,7 +156,7 @@ namespace Eigen * * Example: \include Cwise_scalar_power_array.cpp * Output: \verbinclude Cwise_scalar_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase @@ -155,21 +166,17 @@ namespace Eigen inline const CwiseBinaryOp,Constant,Derived> pow(const Scalar& x,const Eigen::ArrayBase& x); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar), - const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type - pow(const Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); - } - - template - inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow) - pow(const typename Derived::Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename 
internal::promote_scalar_arg::type,Derived,pow)) + pow(const Scalar& x, const Eigen::ArrayBase& exponents) { + typedef typename internal::promote_scalar_arg::type PromotedScalar; + return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)( + typename internal::plain_constant_type::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op(x)), exponents.derived()); } #endif diff --git a/inst/include/Eigen/src/Core/IO.h b/inst/include/Eigen/src/Core/IO.h index da7fd6cc..e81c3152 100644 --- a/inst/include/Eigen/src/Core/IO.h +++ b/inst/include/Eigen/src/Core/IO.h @@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& * - \b rowSuffix string printed at the end of each row * - \b matPrefix string printed at the beginning of the matrix * - \b matSuffix string printed at the end of the matrix + * - \b fill character printed to fill the empty space in aligned columns * * Example: \include IOFormat.cpp * Output: \verbinclude IOFormat.out @@ -53,9 +54,9 @@ struct IOFormat IOFormat(int _precision = StreamPrecision, int _flags = 0, const std::string& _coeffSeparator = " ", const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="", - const std::string& _matPrefix="", const std::string& _matSuffix="") + const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ') : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator), - rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags) + rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags) { // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline // don't add rowSpacer if columns are not to be aligned @@ -71,6 +72,7 @@ struct IOFormat std::string matPrefix, matSuffix; std::string rowPrefix, rowSuffix, rowSeparator, 
rowSpacer; std::string coeffSeparator; + char fill; int precision; int flags; }; @@ -128,6 +130,9 @@ struct significant_decimals_impl template std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt) { + using internal::is_same; + using internal::conditional; + if(_m.size() == 0) { s << fmt.matPrefix << fmt.matSuffix; @@ -136,6 +141,22 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& typename Derived::Nested m = _m; typedef typename Derived::Scalar Scalar; + typedef typename + conditional< + is_same::value || + is_same::value || + is_same::value || + is_same::value, + int, + typename conditional< + is_same >::value || + is_same >::value || + is_same >::value || + is_same >::value, + std::complex, + const Scalar& + >::type + >::type PrintType; Index width = 0; @@ -172,23 +193,31 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& { std::stringstream sstr; sstr.copyfmt(s); - sstr << m.coeff(i,j); + sstr << static_cast(m.coeff(i,j)); width = std::max(width, Index(sstr.str().length())); } } + std::streamsize old_width = s.width(); + char old_fill_character = s.fill(); s << fmt.matPrefix; for(Index i = 0; i < m.rows(); ++i) { if (i) s << fmt.rowSpacer; s << fmt.rowPrefix; - if(width) s.width(width); - s << m.coeff(i, 0); + if(width) { + s.fill(fmt.fill); + s.width(width); + } + s << static_cast(m.coeff(i, 0)); for(Index j = 1; j < m.cols(); ++j) { s << fmt.coeffSeparator; - if (width) s.width(width); - s << m.coeff(i, j); + if(width) { + s.fill(fmt.fill); + s.width(width); + } + s << static_cast(m.coeff(i, j)); } s << fmt.rowSuffix; if( i < m.rows() - 1) @@ -196,6 +225,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& } s << fmt.matSuffix; if(explicit_precision) s.precision(old_precision); + if(width) { + s.fill(old_fill_character); + s.width(old_width); + } return s; } diff --git a/inst/include/Eigen/src/Core/IndexedView.h 
b/inst/include/Eigen/src/Core/IndexedView.h new file mode 100644 index 00000000..08476251 --- /dev/null +++ b/inst/include/Eigen/src/Core/IndexedView.h @@ -0,0 +1,237 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_INDEXED_VIEW_H +#define EIGEN_INDEXED_VIEW_H + +namespace Eigen { + +namespace internal { + +template +struct traits > + : traits +{ + enum { + RowsAtCompileTime = int(array_size::value), + ColsAtCompileTime = int(array_size::value), + MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic, + MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic, + + XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0, + IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 + : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 + : XprTypeIsRowMajor, + + RowIncr = int(get_compile_time_incr::value), + ColIncr = int(get_compile_time_incr::value), + InnerIncr = IsRowMajor ? ColIncr : RowIncr, + OuterIncr = IsRowMajor ? RowIncr : ColIncr, + + HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor), + XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time::ret) : int(outer_stride_at_compile_time::ret), + XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), + + InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime, + IsBlockAlike = InnerIncr==1 && OuterIncr==1, + IsInnerPannel = HasSameStorageOrderAsXprType && is_same,typename conditional::type>::value, + + InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? 
Dynamic : XprInnerStride * InnerIncr, + OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr, + + ReturnAsScalar = is_same::value && is_same::value, + ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike, + ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock), + + // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag, + // but this is too strict regarding negative strides... + DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0, + FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, + FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, + Flags = (traits::Flags & (HereditaryBits | DirectAccessMask )) | FlagsLvalueBit | FlagsRowMajorBit | FlagsLinearAccessBit + }; + + typedef Block BlockType; +}; + +} + +template +class IndexedViewImpl; + + +/** \class IndexedView + * \ingroup Core_Module + * + * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices + * + * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns + * \tparam RowIndices the type of the object defining the sequence of row indices + * \tparam ColIndices the type of the object defining the sequence of column indices + * + * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection + * of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$ + * and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$ be the nested matrix, then the resulting matrix \f$ B \f$ has \c m + * rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$. 
+ * + * The \c RowIndices and \c ColIndices types must be compatible with the following API: + * \code + * operator[](Index) const; + * Index size() const; + * \endcode + * + * Typical supported types thus include: + * - std::vector + * - std::valarray + * - std::array + * - Plain C arrays: int[N] + * - Eigen::ArrayXi + * - decltype(ArrayXi::LinSpaced(...)) + * - Any view/expressions of the previous types + * - Eigen::ArithmeticSequence + * - Eigen::internal::AllRange (helper for Eigen::all) + * - Eigen::internal::SingleRange (helper for single index) + * - etc. + * + * In typical usages of %Eigen, this class should never be used directly. It is the return type of + * DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * \sa class Block + */ +template +class IndexedView : public IndexedViewImpl::StorageKind> +{ +public: + typedef typename IndexedViewImpl::StorageKind>::Base Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView) + + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename internal::remove_all::type NestedExpression; + + template + IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices) + : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices) + {} + + /** \returns number of rows */ + Index rows() const { return internal::size(m_rowIndices); } + + /** \returns number of columns */ + Index cols() const { return internal::size(m_colIndices); } + + /** \returns the nested expression */ + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + /** \returns the nested expression */ + typename internal::remove_reference::type& + nestedExpression() { return m_xpr; } + + /** \returns a const reference to the object storing/generating the row indices */ + const RowIndices& rowIndices() const { return m_rowIndices; } + + /** \returns a const reference to the object storing/generating the column indices */ 
+ const ColIndices& colIndices() const { return m_colIndices; } + +protected: + MatrixTypeNested m_xpr; + RowIndices m_rowIndices; + ColIndices m_colIndices; +}; + + +// Generic API dispatcher +template +class IndexedViewImpl + : public internal::generic_xpr_base >::type +{ +public: + typedef typename internal::generic_xpr_base >::type Base; +}; + +namespace internal { + + +template +struct unary_evaluator, IndexBased> + : evaluator_base > +{ + typedef IndexedView XprType; + + enum { + CoeffReadCost = evaluator::CoeffReadCost /* TODO + cost of row/col index */, + + FlagsLinearAccessBit = (traits::RowsAtCompileTime == 1 || traits::ColsAtCompileTime == 1) ? LinearAccessBit : 0, + + FlagsRowMajorBit = traits::FlagsRowMajorBit, + + Flags = (evaluator::Flags & (HereditaryBits & ~RowMajorBit /*| LinearAccessBit | DirectAccessBit*/)) | FlagsLinearAccessBit | FlagsRowMajorBit, + + Alignment = 0 + }; + + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index index) const + { + Index row = XprType::RowsAtCompileTime == 1 ? 
0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const CoeffReturnType coeff(Index index) const + { + Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + +protected: + + evaluator m_argImpl; + const XprType& m_xpr; + +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_INDEXED_VIEW_H diff --git a/inst/include/Eigen/src/Core/Inverse.h b/inst/include/Eigen/src/Core/Inverse.h index b76f0439..c514438c 100644 --- a/inst/include/Eigen/src/Core/Inverse.h +++ b/inst/include/Eigen/src/Core/Inverse.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2014 Gael Guennebaud +// Copyright (C) 2014-2019 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -10,7 +10,7 @@ #ifndef EIGEN_INVERSE_H #define EIGEN_INVERSE_H -namespace Eigen { +namespace Eigen { template class InverseImpl; @@ -44,19 +44,18 @@ class Inverse : public InverseImpl::S { public: typedef typename XprType::StorageIndex StorageIndex; - typedef typename XprType::PlainObject PlainObject; typedef typename XprType::Scalar Scalar; typedef typename internal::ref_selector::type XprTypeNested; typedef typename internal::remove_all::type XprTypeNestedCleaned; typedef typename internal::ref_selector::type Nested; typedef typename internal::remove_all::type NestedExpression; - + explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr) : m_xpr(xpr) {} - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); } EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; } @@ -82,7 +81,7 @@ namespace internal { /** \internal * \brief Default evaluator for Inverse expression. - * + * * This default evaluator for Inverse expression simply evaluate the inverse into a temporary * by a call to internal::call_assignment_no_alias. 
* Therefore, inverse implementers only have to specialize Assignment, ...> for @@ -97,7 +96,7 @@ struct unary_evaluator > typedef Inverse InverseType; typedef typename InverseType::PlainObject PlainObject; typedef evaluator Base; - + enum { Flags = Base::Flags | EvalBeforeNestingBit }; unary_evaluator(const InverseType& inv_xpr) @@ -106,11 +105,11 @@ struct unary_evaluator > ::new (static_cast(this)) Base(m_result); internal::call_assignment_no_alias(m_result, inv_xpr); } - + protected: PlainObject m_result; }; - + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/Map.h b/inst/include/Eigen/src/Core/Map.h index 548bf9a2..218cc157 100644 --- a/inst/include/Eigen/src/Core/Map.h +++ b/inst/include/Eigen/src/Core/Map.h @@ -11,7 +11,7 @@ #ifndef EIGEN_MAP_H #define EIGEN_MAP_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -47,7 +47,7 @@ struct traits > * \brief A matrix or vector expression mapping an existing array of data. * * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. + * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. * The default is \c #Unaligned. * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout * of an ordinary, contiguous array. This can be overridden by specifying strides. @@ -104,19 +104,19 @@ template class Ma EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? 
m_stride.inner() : 1; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { - return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer() - : int(internal::traits::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) + return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() + : internal::traits::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) : IsVectorAtCompileTime ? (this->size() * innerStride()) - : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride()) + : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) : (this->rows() * innerStride()); } diff --git a/inst/include/Eigen/src/Core/MapBase.h b/inst/include/Eigen/src/Core/MapBase.h index 92c3b281..d856447f 100644 --- a/inst/include/Eigen/src/Core/MapBase.h +++ b/inst/include/Eigen/src/Core/MapBase.h @@ -15,7 +15,7 @@ EIGEN_STATIC_ASSERT((int(internal::evaluator::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \ YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT) -namespace Eigen { +namespace Eigen { /** \ingroup Core_Module * @@ -87,9 +87,11 @@ template class MapBase typedef typename Base::CoeffReturnType CoeffReturnType; /** \copydoc DenseBase::rows() */ - EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); } /** \copydoc DenseBase::cols() */ - EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); } /** Returns a pointer to the first coefficient of the matrix or vector. 
* diff --git a/inst/include/Eigen/src/Core/MathFunctions.h b/inst/include/Eigen/src/Core/MathFunctions.h index 01736c2a..61b78f4f 100644 --- a/inst/include/Eigen/src/Core/MathFunctions.h +++ b/inst/include/Eigen/src/Core/MathFunctions.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2006-2010 Benoit Jacob +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,10 +11,11 @@ #ifndef EIGEN_MATHFUNCTIONS_H #define EIGEN_MATHFUNCTIONS_H -// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html // TODO this should better be moved to NumTraits -#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L - +// Source: WolframAlpha +#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L +#define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L +#define EIGEN_LN2 0.693147180559945309417232121458176568075500134360255254120680009493393621L namespace Eigen { @@ -97,7 +99,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct real_impl > { @@ -145,7 +147,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct imag_impl > { @@ -213,12 +215,12 @@ struct imag_ref_default_impl template struct imag_ref_default_impl { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Scalar run(Scalar&) { return Scalar(0); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline const Scalar run(const Scalar&) { return Scalar(0); @@ -239,7 +241,7 @@ struct imag_ref_retval ****************************************************************************/ template::IsComplex> -struct conj_impl 
+struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -249,7 +251,7 @@ struct conj_impl }; template -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -259,6 +261,9 @@ struct conj_impl } }; +template::IsComplex> +struct conj_impl : conj_default_impl {}; + template struct conj_retval { @@ -308,6 +313,65 @@ struct abs2_retval typedef typename NumTraits::Real type; }; +/**************************************************************************** +* Implementation of sqrt/rsqrt * +****************************************************************************/ + +template +struct sqrt_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x) + { + EIGEN_USING_STD(sqrt); + return sqrt(x); + } +}; + +// Complex sqrt defined in MathFunctionsImpl.h. +template EIGEN_DEVICE_FUNC std::complex complex_sqrt(const std::complex& a_x); + +// Custom implementation is faster than `std::sqrt`, works on +// GPU, and correctly handles special cases (unlike MSVC). +template +struct sqrt_impl > +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE std::complex run(const std::complex& x) + { + return complex_sqrt(x); + } +}; + +template +struct sqrt_retval +{ + typedef Scalar type; +}; + +// Default implementation relies on numext::sqrt, at bottom of file. +template +struct rsqrt_impl; + +// Complex rsqrt defined in MathFunctionsImpl.h. 
+template EIGEN_DEVICE_FUNC std::complex complex_rsqrt(const std::complex& a_x); + +template +struct rsqrt_impl > +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE std::complex run(const std::complex& x) + { + return complex_rsqrt(x); + } +}; + +template +struct rsqrt_retval +{ + typedef Scalar type; +}; + /**************************************************************************** * Implementation of norm1 * ****************************************************************************/ @@ -322,7 +386,7 @@ struct norm1_default_impl EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return abs(x.real()) + abs(x.imag()); } }; @@ -333,7 +397,7 @@ struct norm1_default_impl EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return abs(x); } }; @@ -363,7 +427,7 @@ struct hypot_retval * Implementation of cast * ****************************************************************************/ -template +template struct cast_impl { EIGEN_DEVICE_FUNC @@ -373,6 +437,22 @@ struct cast_impl } }; +// Casting from S -> Complex leads to an implicit conversion from S to T, +// generating warnings on clang. Here we explicitly cast the real component. +template +struct cast_impl::IsComplex && NumTraits::IsComplex + >::type> +{ + EIGEN_DEVICE_FUNC + static inline NewType run(const OldType& x) + { + typedef typename NumTraits::Real NewReal; + return static_cast(static_cast(x)); + } +}; + // here, for once, we're plainly returning NewType: we don't want cast to do weird things. 
template @@ -386,29 +466,59 @@ inline NewType cast(const OldType& x) * Implementation of round * ****************************************************************************/ +template +struct round_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) #if EIGEN_HAS_CXX11_MATH - template - struct round_impl { - static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) - using std::round; - return round(x); - } - }; + EIGEN_USING_STD(round); +#endif + return Scalar(round(x)); + } +}; + +#if !EIGEN_HAS_CXX11_MATH +#if EIGEN_HAS_C99_MATH +// Use ::roundf for float. +template<> +struct round_impl { + EIGEN_DEVICE_FUNC + static inline float run(const float& x) + { + return ::roundf(x); + } +}; #else - template - struct round_impl +template +struct round_using_floor_ceil_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) { - static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) - EIGEN_USING_STD_MATH(floor); - EIGEN_USING_STD_MATH(ceil); - return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5)); + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) + // Without C99 round/roundf, resort to floor/ceil. + EIGEN_USING_STD(floor); + EIGEN_USING_STD(ceil); + // If not enough precision to resolve a decimal at all, return the input. + // Otherwise, adding 0.5 can trigger an increment by 1. + const Scalar limit = Scalar(1ull << (NumTraits::digits() - 1)); + if (x >= limit || x <= -limit) { + return x; } - }; -#endif + return (x > Scalar(0)) ? 
Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5))); + } +}; + +template<> +struct round_impl : round_using_floor_ceil_impl {}; + +template<> +struct round_impl : round_using_floor_ceil_impl {}; +#endif // EIGEN_HAS_C99_MATH +#endif // !EIGEN_HAS_CXX11_MATH template struct round_retval @@ -417,43 +527,112 @@ struct round_retval }; /**************************************************************************** -* Implementation of arg * +* Implementation of rint * ****************************************************************************/ +template +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) #if EIGEN_HAS_CXX11_MATH - template - struct arg_impl { - static inline Scalar run(const Scalar& x) - { - EIGEN_USING_STD_MATH(arg); - return arg(x); - } - }; -#else - template::IsComplex> - struct arg_default_impl + EIGEN_USING_STD(rint); +#endif + return rint(x); + } +}; + +#if !EIGEN_HAS_CXX11_MATH +template<> +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline double run(const double& x) { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) - { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); } - }; + return ::rint(x); + } +}; +template<> +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline float run(const float& x) + { + return ::rintf(x); + } +}; +#endif - template - struct arg_default_impl +template +struct rint_retval +{ + typedef Scalar type; +}; + +/**************************************************************************** +* Implementation of arg * +****************************************************************************/ + +// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs. +// This seems to be fixed in VS 2019. 
+#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920) +// std::arg is only defined for types of std::complex, or integer types or float/double/long double +template::IsComplex || is_integral::value + || is_same::value || is_same::value + || is_same::value > +struct arg_default_impl; + +template +struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) - { - EIGEN_USING_STD_MATH(arg); - return arg(x); - } - }; + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // HIP does not seem to have a native device side implementation for the math routine "arg" + using std::arg; + #else + EIGEN_USING_STD(arg); + #endif + return static_cast(arg(x)); + } +}; + +// Must be non-complex floating-point type (e.g. half/bfloat16). +template +struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); + } +}; +#else +template::IsComplex> +struct arg_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return (x < RealScalar(0)) ? 
RealScalar(EIGEN_PI) : RealScalar(0); + } +}; - template struct arg_impl : arg_default_impl {}; +template +struct arg_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + EIGEN_USING_STD(arg); + return arg(x); + } +}; #endif +template struct arg_impl : arg_default_impl {}; template struct arg_retval @@ -461,6 +640,80 @@ struct arg_retval typedef typename NumTraits::Real type; }; +/**************************************************************************** +* Implementation of expm1 * +****************************************************************************/ + +// This implementation is based on GSL Math's expm1. +namespace std_fallback { + // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar, + // or that there is no suitable std::expm1 function available. Implementation + // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php. + template + EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + typedef typename NumTraits::Real RealScalar; + + EIGEN_USING_STD(exp); + Scalar u = exp(x); + if (numext::equal_strict(u, Scalar(1))) { + return x; + } + Scalar um1 = u - RealScalar(1); + if (numext::equal_strict(um1, Scalar(-1))) { + return RealScalar(-1); + } + + EIGEN_USING_STD(log); + Scalar logu = log(u); + return numext::equal_strict(u, logu) ? 
u : (u - RealScalar(1)) * x / logu; + } +} + +template +struct expm1_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + #if EIGEN_HAS_CXX11_MATH + using std::expm1; + #else + using std_fallback::expm1; + #endif + return expm1(x); + } +}; + +template +struct expm1_retval +{ + typedef Scalar type; +}; + +/**************************************************************************** +* Implementation of log * +****************************************************************************/ + +// Complex log defined in MathFunctionsImpl.h. +template EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z); + +template +struct log_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_USING_STD(log); + return static_cast(log(x)); + } +}; + +template +struct log_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run(const std::complex& z) + { + return complex_log(z); + } +}; + /**************************************************************************** * Implementation of log1p * ****************************************************************************/ @@ -472,25 +725,38 @@ namespace std_fallback { EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) typedef typename NumTraits::Real RealScalar; - EIGEN_USING_STD_MATH(log); + EIGEN_USING_STD(log); Scalar x1p = RealScalar(1) + x; - return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + Scalar log_1p = log_impl::run(x1p); + const bool is_small = numext::equal_strict(x1p, Scalar(1)); + const bool is_inf = numext::equal_strict(x1p, log_1p); + return (is_small || is_inf) ? 
x : x * (log_1p / (x1p - RealScalar(1))); } } template struct log1p_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH using std::log1p; - #endif + #else using std_fallback::log1p; + #endif return log1p(x); } }; +// Specialization for complex types that are not supported by std::log1p. +template +struct log1p_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run( + const std::complex& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) + return std_fallback::log1p(x); + } +}; template struct log1p_retval @@ -509,7 +775,7 @@ struct pow_impl typedef typename ScalarBinaryOpTraits >::ReturnType result_type; static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y) { - EIGEN_USING_STD_MATH(pow); + EIGEN_USING_STD(pow); return pow(x, y); } }; @@ -687,7 +953,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); } -// Implementatin of is* functions +// Implementation of is* functions // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. 
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) @@ -716,7 +982,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -731,7 +997,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -746,7 +1012,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -803,7 +1069,6 @@ template EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x) template EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x); template T generic_fast_tanh_float(const T& a_x); - } // end namespace internal /**************************************************************************** @@ -812,12 +1077,12 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#ifndef __CUDA_ARCH__ +#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) { - EIGEN_USING_STD_MATH(min); + EIGEN_USING_STD(min) return min EIGEN_NOT_A_MACRO (x,y); } @@ -825,7 +1090,7 @@ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) { - EIGEN_USING_STD_MATH(max); + EIGEN_USING_STD(max) return max EIGEN_NOT_A_MACRO (x,y); } #else @@ -841,6 +1106,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) { return fminf(x, y); } 
+template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) +{ + return fmin(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) +{ +#if defined(EIGEN_HIPCC) + // no "fminl" on HIP yet + return (x < y) ? x : y; +#else + return fminl(x, y); +#endif +} + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) @@ -853,6 +1136,92 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) { return fmaxf(x, y); } +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) +{ + return fmax(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) +{ +#if defined(EIGEN_HIPCC) + // no "fmaxl" on HIP yet + return (x > y) ? x : y; +#else + return fmaxl(x, y); +#endif +} +#endif + +#if defined(SYCL_DEVICE_ONLY) + + +#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long) +#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long) +#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) +#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + 
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) +#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) +#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) +#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double) +#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double) +#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double) + +#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ +template<> \ + EIGEN_DEVICE_FUNC \ + EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \ + return cl::sycl::FUNC(x); \ + } + +#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE) + +#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \ + template<> \ + EIGEN_DEVICE_FUNC \ + EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \ + return cl::sycl::FUNC(x, y); \ + } + +#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ + SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE) + +#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \ + 
SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE) + +SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin) +SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax) + #endif @@ -922,6 +1291,34 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) EIGEN_DEVICE_FUNC inline bool abs2(bool x) { return x; } +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) +{ + return x > y ? x - y : y - x; +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y) +{ + return fabsf(x - y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y) +{ + return fabs(x - y); +} + +#if !defined(EIGEN_GPUCC) +// HIP and CUDA do not support long double. +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) { + return fabsl(x - y); +} +#endif + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) @@ -936,6 +1333,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y); } +#if defined(SYCL_DEVICE_ONLY) + SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot) +#endif + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) @@ -943,7 +1344,11 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -958,10 +1363,27 @@ inline typename internal::pow_impl::result_type pow(const Scala return internal::pow_impl::run(x, y); } +#if defined(SYCL_DEVICE_ONLY) 
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow) +#endif + template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); } template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); } template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool) +#endif + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x); +} + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) @@ -969,15 +1391,23 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x); } +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round) +#endif + template EIGEN_DEVICE_FUNC T (floor)(const T& x) { - EIGEN_USING_STD_MATH(floor); + EIGEN_USING_STD(floor) return floor(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -989,11 +1419,15 @@ template EIGEN_DEVICE_FUNC T (ceil)(const T& x) { - EIGEN_USING_STD_MATH(ceil); + EIGEN_USING_STD(ceil); return ceil(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1026,22 +1460,42 @@ inline int log2(int x) * * It's usage is justified in performance critical functions, like norm/normalize. 
*/ +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x); +} + +// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool). +template<> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +bool sqrt(const bool &x) { return x; } + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt) +#endif + +/** \returns the reciprocal square root of \a x. **/ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T sqrt(const T &x) +T rsqrt(const T& x) { - EIGEN_USING_STD_MATH(sqrt); - return sqrt(x); + return internal::rsqrt_impl::run(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T &x) { - EIGEN_USING_STD_MATH(log); - return log(x); + return internal::log_impl::run(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log) +#endif + + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1053,7 +1507,7 @@ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::IsSigned || NumTraits::IsComplex,typename NumTraits::Real>::type abs(const T &x) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return abs(x); } @@ -1064,12 +1518,12 @@ abs(const T &x) { return x; } -#if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } -EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } -#endif // defined(__SYCL_DEVICE_ONLY__) +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs) +#endif -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1090,26 +1544,69 @@ double abs(const std::complex& x) { template 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T &x) { - EIGEN_USING_STD_MATH(exp); + EIGEN_USING_STD(exp); return exp(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + float com = ::expf(x.real()); + float res_real = com * ::cosf(x.imag()); + float res_imag = com * ::sinf(x.imag()); + return std::complex(res_real, res_imag); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + double com = ::exp(x.real()); + double res_real = com * ::cos(x.imag()); + double res_imag = com * ::sin(x.imag()); + return std::complex(res_real, res_imag); +} +#endif + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x); +} + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1) +#endif + +#if defined(EIGEN_GPUCC) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float expm1(const float &x) { return ::expm1f(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double expm1(const double &x) { return ::expm1(x); } #endif template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cos(const T &x) { - EIGEN_USING_STD_MATH(cos); + EIGEN_USING_STD(cos); return cos(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1120,11 +1617,15 @@ double cos(const double &x) { return ::cos(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sin(const T &x) { - 
EIGEN_USING_STD_MATH(sin); + EIGEN_USING_STD(sin); return sin(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1135,11 +1636,15 @@ double sin(const double &x) { return ::sin(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T &x) { - EIGEN_USING_STD_MATH(tan); + EIGEN_USING_STD(tan); return tan(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1150,11 +1655,25 @@ double tan(const double &x) { return ::tan(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acos(const T &x) { - EIGEN_USING_STD_MATH(acos); + EIGEN_USING_STD(acos); return acos(x); } -#ifdef __CUDACC__ +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T acosh(const T &x) { + EIGEN_USING_STD(acosh); + return static_cast(acosh(x)); +} +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1165,11 +1684,25 @@ double acos(const double &x) { return ::acos(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asin(const T &x) { - EIGEN_USING_STD_MATH(asin); + EIGEN_USING_STD(asin); return asin(x); } -#ifdef __CUDACC__ +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T asinh(const T &x) { + EIGEN_USING_STD(asinh); + return static_cast(asinh(x)); +} +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh) +#endif + +#if defined(EIGEN_GPUCC) template<> 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1180,11 +1713,25 @@ double asin(const double &x) { return ::asin(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan(const T &x) { - EIGEN_USING_STD_MATH(atan); - return atan(x); + EIGEN_USING_STD(atan); + return static_cast(atan(x)); +} + +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T atanh(const T &x) { + EIGEN_USING_STD(atanh); + return static_cast(atanh(x)); } +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh) +#endif -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1196,11 +1743,15 @@ double atan(const double &x) { return ::atan(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cosh(const T &x) { - EIGEN_USING_STD_MATH(cosh); - return cosh(x); + EIGEN_USING_STD(cosh); + return static_cast(cosh(x)); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1211,11 +1762,15 @@ double cosh(const double &x) { return ::cosh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sinh(const T &x) { - EIGEN_USING_STD_MATH(sinh); - return sinh(x); + EIGEN_USING_STD(sinh); + return static_cast(sinh(x)); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1226,16 +1781,20 @@ double sinh(const double &x) { return ::sinh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T &x) { - EIGEN_USING_STD_MATH(tanh); + EIGEN_USING_STD(tanh); return tanh(x); } -#if 
(!defined(__CUDACC__)) && EIGEN_FAST_MATH +#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY) EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1246,11 +1805,15 @@ double tanh(const double &x) { return ::tanh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) { - EIGEN_USING_STD_MATH(fmod); + EIGEN_USING_STD(fmod); return fmod(a, b); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod) +#endif + +#if defined(EIGEN_GPUCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { @@ -1264,6 +1827,23 @@ double fmod(const double& a, const double& b) { } #endif +#if defined(SYCL_DEVICE_ONLY) +#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE +#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC +#undef SYCL_SPECIALIZE_UNARY_FUNC +#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC +#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC +#undef SYCL_SPECIALIZE_BINARY_FUNC +#endif + } // end namespace numext namespace internal { @@ -1387,18 +1967,23 @@ template<> struct random_impl { return random(0,1)==0 ? false : true; } + + static inline bool run(const bool& a, const bool& b) + { + return random(a, b)==0 ? 
false : true; + } }; template<> struct scalar_fuzzy_impl { typedef bool RealScalar; - + template EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&) { return !x; } - + EIGEN_DEVICE_FUNC static inline bool isApprox(bool x, bool y, bool) { @@ -1410,10 +1995,61 @@ template<> struct scalar_fuzzy_impl { return (!x) || y; } - + +}; + +} // end namespace internal + +// Default implementations that rely on other numext implementations +namespace internal { + +// Specialization for complex types that are not supported by std::expm1. +template +struct expm1_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run( + const std::complex& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) + RealScalar xr = x.real(); + RealScalar xi = x.imag(); + // expm1(z) = exp(z) - 1 + // = exp(x + i * y) - 1 + // = exp(x) * (cos(y) + i * sin(y)) - 1 + // = exp(x) * cos(y) - 1 + i * exp(x) * sin(y) + // Imag(expm1(z)) = exp(x) * sin(y) + // Real(expm1(z)) = exp(x) * cos(y) - 1 + // = exp(x) * cos(y) - 1. + // = expm1(x) + exp(x) * (cos(y) - 1) + // = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2) + RealScalar erm1 = numext::expm1(xr); + RealScalar er = erm1 + RealScalar(1.); + RealScalar sin2 = numext::sin(xi / RealScalar(2.)); + sin2 = sin2 * sin2; + RealScalar s = numext::sin(xi); + RealScalar real_part = erm1 - RealScalar(2.) 
* er * sin2; + return std::complex(real_part, er * s); + } }; - +template +struct rsqrt_impl { + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE T run(const T& x) { + return T(1)/numext::sqrt(x); + } +}; + +#if defined(EIGEN_GPU_COMPILE_PHASE) +template +struct conj_impl, true> +{ + EIGEN_DEVICE_FUNC + static inline std::complex run(const std::complex& x) + { + return std::complex(numext::real(x), -numext::imag(x)); + } +}; +#endif + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/MathFunctionsImpl.h b/inst/include/Eigen/src/Core/MathFunctionsImpl.h index 9c1ceb0e..4eaaaa78 100644 --- a/inst/include/Eigen/src/Core/MathFunctionsImpl.h +++ b/inst/include/Eigen/src/Core/MathFunctionsImpl.h @@ -17,24 +17,28 @@ namespace internal { /** \internal \returns the hyperbolic tan of \a a (coeff-wise) Doesn't do anything fancy, just a 13/6-degree rational interpolant which - is accurate up to a couple of ulp in the range [-9, 9], outside of which - the tanh(x) = +/-1. + is accurate up to a couple of ulps in the (approximate) range [-8, 8], + outside of which tanh(x) = +/-1 in single precision. The input is clamped + to the range [-c, c]. The value c is chosen as the smallest value where + the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004] + the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero. This implementation works on both scalars and packets. */ template T generic_fast_tanh_float(const T& a_x) { - // Clamp the inputs to the range [-9, 9] since anything outside - // this range is +/-1.0f in single-precision. - const T plus_9 = pset1(9.f); - const T minus_9 = pset1(-9.f); - // NOTE GCC prior to 6.3 might improperly optimize this max/min - // step such that if a_x is nan, x will be either 9 or -9, - // and tanh will return 1 or -1 instead of nan. 
- // This is supposed to be fixed in gcc6.3, - // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - const T x = pmax(minus_9,pmin(plus_9,a_x)); + // Clamp the inputs to the range [-c, c] +#ifdef EIGEN_VECTORIZE_FMA + const T plus_clamp = pset1(7.99881172180175781f); + const T minus_clamp = pset1(-7.99881172180175781f); +#else + const T plus_clamp = pset1(7.90531110763549805f); + const T minus_clamp = pset1(-7.90531110763549805f); +#endif + const T tiny = pset1(0.0004f); + const T x = pmax(pmin(a_x, plus_clamp), minus_clamp); + const T tiny_mask = pcmp_lt(pabs(a_x), tiny); // The monomial coefficients of the numerator polynomial (odd). const T alpha_1 = pset1(4.89352455891786e-03f); const T alpha_3 = pset1(6.37261928875436e-04f); @@ -62,24 +66,30 @@ T generic_fast_tanh_float(const T& a_x) p = pmadd(x2, p, alpha_1); p = pmul(x, p); - // Evaluate the denominator polynomial p. + // Evaluate the denominator polynomial q. T q = pmadd(x2, beta_6, beta_4); q = pmadd(x2, q, beta_2); q = pmadd(x2, q, beta_0); // Divide the numerator by the denominator. - return pdiv(p, q); + return pselect(tiny_mask, x, pdiv(p, q)); } template -EIGEN_STRONG_INLINE +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) { - EIGEN_USING_STD_MATH(sqrt); + // IEEE IEC 6059 special cases. 
+ if ((numext::isinf)(x) || (numext::isinf)(y)) + return NumTraits::infinity(); + if ((numext::isnan)(x) || (numext::isnan)(y)) + return NumTraits::quiet_NaN(); + + EIGEN_USING_STD(sqrt); RealScalar p, qp; p = numext::maxi(x,y); if(p==RealScalar(0)) return RealScalar(0); - qp = numext::mini(y,x) / p; + qp = numext::mini(y,x) / p; return p * sqrt(RealScalar(1) + qp*qp); } @@ -87,13 +97,102 @@ template struct hypot_impl { typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) + static EIGEN_DEVICE_FUNC + inline RealScalar run(const Scalar& x, const Scalar& y) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return positive_real_hypot(abs(x), abs(y)); } }; +// Generic complex sqrt implementation that correctly handles corner cases +// according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt +template +EIGEN_DEVICE_FUNC std::complex complex_sqrt(const std::complex& z) { + // Computes the principal sqrt of the input. + // + // For a complex square root of the number x + i*y. We want to find real + // numbers u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*v. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = y / (2 * u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = y / (2 * v) + // + // Letting w = sqrt(0.5 * (|x| + |z|)), + // if x == 0: u = w, v = sign(y) * w + // if x > 0: u = w, v = y / (2 * w) + // if x < 0: u = |y| / (2 * w), v = sign(y) * w + + const T x = numext::real(z); + const T y = numext::imag(z); + const T zero = T(0); + const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y))); + + return + (numext::isinf)(y) ? std::complex(NumTraits::infinity(), y) + : x == zero ? std::complex(w, y < zero ? -w : w) + : x > zero ? 
std::complex(w, y / (2 * w)) + : std::complex(numext::abs(y) / (2 * w), y < zero ? -w : w ); +} + +// Generic complex rsqrt implementation. +template +EIGEN_DEVICE_FUNC std::complex complex_rsqrt(const std::complex& z) { + // Computes the principal reciprocal sqrt of the input. + // + // For a complex reciprocal square root of the number z = x + i*y. We want to + // find real numbers u and v such that + // (u + i*v)^2 = 1 / (x + i*y) <=> + // u^2 - v^2 + i*2*u*v = x/|z|^2 - i*v/|z|^2. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x/|z|^2 + // 2*u*v = y/|z|^2. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + |z|)) / |z| + // v = -y / (2 * u * |z|) + // and for x < 0, + // v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z| + // u = -y / (2 * v * |z|) + // + // Letting w = sqrt(0.5 * (|x| + |z|)), + // if x == 0: u = w / |z|, v = -sign(y) * w / |z| + // if x > 0: u = w / |z|, v = -y / (2 * w * |z|) + // if x < 0: u = |y| / (2 * w * |z|), v = -sign(y) * w / |z| + + const T x = numext::real(z); + const T y = numext::imag(z); + const T zero = T(0); + + const T abs_z = numext::hypot(x, y); + const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z)); + const T woz = w / abs_z; + // Corner cases consistent with 1/sqrt(z) on gcc/clang. + return + abs_z == zero ? std::complex(NumTraits::infinity(), NumTraits::quiet_NaN()) + : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex(zero, zero) + : x == zero ? std::complex(woz, y < zero ? woz : -woz) + : x > zero ? std::complex(woz, -y / (2 * w * abs_z)) + : std::complex(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz ); +} + +template +EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z) { + // Computes complex log. 
+ T a = numext::abs(z); + EIGEN_USING_STD(atan2); + T b = atan2(z.imag(), z.real()); + return std::complex(numext::log(a), b); +} + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/Matrix.h b/inst/include/Eigen/src/Core/Matrix.h index 7f4a7af9..f0e59a91 100644 --- a/inst/include/Eigen/src/Core/Matrix.h +++ b/inst/include/Eigen/src/Core/Matrix.h @@ -29,7 +29,7 @@ struct traits > required_alignment = unpacket_traits::alignment, packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0 }; - + public: typedef _Scalar Scalar; typedef Dense StorageKind; @@ -44,7 +44,7 @@ struct traits > Options = _Options, InnerStrideAtCompileTime = 1, OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, - + // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, Alignment = actual_alignment @@ -255,53 +255,93 @@ class Matrix * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Matrix() : Base() + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix() : Base() { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } // FIXME is it still needed - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Matrix(internal::constructor_without_unaligned_array_assert) : Base(internal::constructor_without_unaligned_array_assert()) { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } #if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) : Base(std::move(other)) { Base::_check_template_params(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix& operator=(Matrix&& other) 
EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { - other.swap(*this); + Base::operator=(std::move(other)); return *this; } #endif - #ifndef EIGEN_PARSED_BY_DOXYGEN +#if EIGEN_HAS_CXX11 + /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&... args) + * + * Example: \include Matrix_variadic_ctor_cxx11.cpp + * Output: \verbinclude Matrix_variadic_ctor_cxx11.out + * + * \sa Matrix(const std::initializer_list>&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) {} + + /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Matrix_initializer_list_23_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column vector, implicit transposition from a single row is allowed. + * Therefore VectorXd{{1,2,3,4,5}} is legal and the more verbose syntax + * RowVectorXd{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Matrix_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes, + * and implicit transposition is allowed for compile-time vectors only. + * + * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args) + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} +#endif // end EIGEN_HAS_CXX11 + +#ifndef EIGEN_PARSED_BY_DOXYGEN // This constructor is for both 1x1 matrices and dynamic vectors template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE explicit Matrix(const T& x) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit Matrix(const T& x) { Base::_check_template_params(); Base::template _init1(x); } template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const T0& x, const T1& y) { Base::_check_template_params(); Base::template _init2(x, y); } - #else + + +#else /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Matrix(const Scalar *data); @@ -311,7 +351,7 @@ class Matrix * This is useful for dynamic-size vectors. For fixed-size vectors, * it is redundant to pass these parameters, so one should use the default constructor * Matrix() instead. - * + * * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance, * calling Matrix(1) will call the initialization constructor: Matrix(const Scalar&). * For fixed-size \c 1x1 matrices it is therefore recommended to use the default @@ -319,14 +359,15 @@ class Matrix * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives). */ EIGEN_STRONG_INLINE explicit Matrix(Index dim); - /** \brief Constructs an initialized 1x1 matrix with the given coefficient */ + /** \brief Constructs an initialized 1x1 matrix with the given coefficient + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x); /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns. * * This is useful for dynamic-size matrices. 
For fixed-size matrices, * it is redundant to pass these parameters, so one should use the default constructor * Matrix() instead. - * + * * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance, * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y). * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default @@ -335,12 +376,15 @@ class Matrix */ EIGEN_DEVICE_FUNC Matrix(Index rows, Index cols); - - /** \brief Constructs an initialized 2D vector with given coefficients */ + + /** \brief Constructs an initialized 2D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x, const Scalar& y); - #endif + #endif // end EIGEN_PARSED_BY_DOXYGEN - /** \brief Constructs an initialized 3D vector with given coefficients */ + /** \brief Constructs an initialized 3D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) { @@ -350,7 +394,9 @@ class Matrix m_storage.data()[1] = y; m_storage.data()[2] = z; } - /** \brief Constructs an initialized 4D vector with given coefficients */ + /** \brief Constructs an initialized 4D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) 
+ */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) { @@ -377,8 +423,10 @@ class Matrix : Base(other.derived()) { } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); } /////////// Geometry module /////////// @@ -405,7 +453,7 @@ class Matrix * * \ingroup Core_Module * - * Eigen defines several typedef shortcuts for most common matrix and vector types. + * %Eigen defines several typedef shortcuts for most common matrix and vector types. * * The general patterns are the following: * @@ -418,6 +466,15 @@ class Matrix * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is * a fixed-size vector of 4 complex floats. * + * With \cpp11, template alias are also defined for common sizes. + * They follow the same pattern as above except that the scalar type suffix is replaced by a + * template parameter, i.e.: + * - `MatrixSize` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size. + * - `MatrixXSize` and `MatrixSizeX` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices. + * - `VectorSize` and `RowVectorSize` for column and row vectors. + * + * With \cpp11, you can also use fully generic column and row vector types: `Vector` and `RowVector`. 
+ * * \sa class Matrix */ @@ -454,6 +511,55 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS +#if EIGEN_HAS_CXX11 + +#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Vector##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using RowVector##SizeSuffix = Matrix; + +#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##Size##X = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##X##Size = Matrix; + +EIGEN_MAKE_TYPEDEFS(2, 2) +EIGEN_MAKE_TYPEDEFS(3, 3) +EIGEN_MAKE_TYPEDEFS(4, 4) +EIGEN_MAKE_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_FIXED_TYPEDEFS(2) +EIGEN_MAKE_FIXED_TYPEDEFS(3) +EIGEN_MAKE_FIXED_TYPEDEFS(4) + +/** \ingroup matrixtypedefs + * \brief \cpp11 */ +template +using Vector = Matrix; + +/** \ingroup matrixtypedefs + * \brief \cpp11 */ +template +using RowVector = Matrix; + +#undef EIGEN_MAKE_TYPEDEFS +#undef EIGEN_MAKE_FIXED_TYPEDEFS + +#endif // EIGEN_HAS_CXX11 + } // end namespace Eigen #endif // EIGEN_MATRIX_H diff --git a/inst/include/Eigen/src/Core/MatrixBase.h b/inst/include/Eigen/src/Core/MatrixBase.h index f8bcc8c6..45c3a596 100644 --- a/inst/include/Eigen/src/Core/MatrixBase.h +++ b/inst/include/Eigen/src/Core/MatrixBase.h @@ -76,6 +76,7 @@ template class MatrixBase using Base::coeffRef; using Base::lazyAssign; using Base::eval; + using Base::operator-; using Base::operator+=; using Base::operator-=; using Base::operator*=; @@ -122,7 +123,6 @@ template class MatrixBase #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase #define EIGEN_DOC_UNARY_ADDONS(X,Y) -# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/CommonCwiseBinaryOps.h" # 
include "../plugins/MatrixCwiseUnaryOps.h" # include "../plugins/MatrixCwiseBinaryOps.h" @@ -268,6 +268,8 @@ template class MatrixBase Derived& setIdentity(); EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setUnit(Index i); + EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i); bool isIdentity(const RealScalar& prec = NumTraits::dummy_precision()) const; bool isDiagonal(const RealScalar& prec = NumTraits::dummy_precision()) const; @@ -296,7 +298,7 @@ template class MatrixBase EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } - NoAlias noalias(); + NoAlias EIGEN_DEVICE_FUNC noalias(); // TODO forceAlignedAccess is temporarily disabled // Need to find a nicer workaround. @@ -326,6 +328,7 @@ template class MatrixBase inline const PartialPivLU lu() const; + EIGEN_DEVICE_FUNC inline const Inverse inverse() const; template @@ -335,12 +338,15 @@ template class MatrixBase bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + template inline void computeInverseWithCheck( ResultType& inverse, bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + + EIGEN_DEVICE_FUNC Scalar determinant() const; /////////// Cholesky module /////////// @@ -412,15 +418,19 @@ template class MatrixBase ////////// Householder module /////////// + EIGEN_DEVICE_FUNC void makeHouseholderInPlace(Scalar& tau, RealScalar& beta); template + EIGEN_DEVICE_FUNC void makeHouseholder(EssentialPart& essential, Scalar& tau, RealScalar& beta) const; template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheLeft(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheRight(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); @@ -428,8 +438,10 @@ template class MatrixBase ///////// Jacobi module ///////// template + 
EIGEN_DEVICE_FUNC void applyOnTheLeft(Index p, Index q, const JacobiRotation& j); template + EIGEN_DEVICE_FUNC void applyOnTheRight(Index p, Index q, const JacobiRotation& j); ///////// SparseCore module ///////// @@ -456,6 +468,11 @@ template class MatrixBase const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) +#if EIGEN_HAS_CXX11_MATH + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine) +#endif EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) diff --git a/inst/include/Eigen/src/Core/NestByValue.h b/inst/include/Eigen/src/Core/NestByValue.h index 13adf070..b4275768 100644 --- a/inst/include/Eigen/src/Core/NestByValue.h +++ b/inst/include/Eigen/src/Core/NestByValue.h @@ -16,7 +16,11 @@ namespace Eigen { namespace internal { template struct traits > : public traits -{}; +{ + enum { + Flags = traits::Flags & ~NestByRefBit + }; +}; } /** \class NestByValue @@ -41,57 +45,13 @@ template class NestByValue EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const - { - return m_expression.coeff(row, col); - } - - 
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) - { - return m_expression.const_cast_derived().coeffRef(row, col); - } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const - { - return m_expression.coeff(index); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) - { - return m_expression.const_cast_derived().coeffRef(index); - } - - template - inline const PacketScalar packet(Index row, Index col) const - { - return m_expression.template packet(row, col); - } - - template - inline void writePacket(Index row, Index col, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket(row, col, x); - } - - template - inline const PacketScalar packet(Index index) const - { - return m_expression.template packet(index); - } - - template - inline void writePacket(Index index, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket(index, x); - } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; } + EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; } + protected: const ExpressionType m_expression; }; @@ -99,12 +59,27 @@ template class NestByValue /** \returns an expression of the temporary version of *this. 
*/ template -inline const NestByValue +EIGEN_DEVICE_FUNC inline const NestByValue DenseBase::nestByValue() const { return NestByValue(derived()); } +namespace internal { + +// Evaluator of NestByValue -> forwards to the evaluator of the nested expression +template +struct evaluator > + : public evaluator +{ + typedef evaluator Base; + + EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue& xpr) + : Base(xpr.nestedExpression()) + {} +}; +} + } // end namespace Eigen #endif // EIGEN_NESTBYVALUE_H diff --git a/inst/include/Eigen/src/Core/NoAlias.h b/inst/include/Eigen/src/Core/NoAlias.h index 33908010..570283d9 100644 --- a/inst/include/Eigen/src/Core/NoAlias.h +++ b/inst/include/Eigen/src/Core/NoAlias.h @@ -33,6 +33,7 @@ class NoAlias public: typedef typename ExpressionType::Scalar Scalar; + EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} template @@ -74,10 +75,10 @@ class NoAlias * * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag. * Currently, even though several expressions may alias, only product - * expressions have this flag. Therefore, noalias() is only usefull when + * expressions have this flag. Therefore, noalias() is only useful when * the source expression contains a matrix product. 
* - * Here are some examples where noalias is usefull: + * Here are some examples where noalias is useful: * \code * D.noalias() = A * B; * D.noalias() += A.transpose() * B; @@ -98,7 +99,7 @@ class NoAlias * \sa class NoAlias */ template -NoAlias MatrixBase::noalias() +NoAlias EIGEN_DEVICE_FUNC MatrixBase::noalias() { return NoAlias(derived()); } diff --git a/inst/include/Eigen/src/Core/NumTraits.h b/inst/include/Eigen/src/Core/NumTraits.h index daf48987..72eac5a9 100644 --- a/inst/include/Eigen/src/Core/NumTraits.h +++ b/inst/include/Eigen/src/Core/NumTraits.h @@ -21,12 +21,14 @@ template< typename T, bool is_integer = NumTraits::IsInteger> struct default_digits10_impl { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits::digits10; } }; template struct default_digits10_impl // Floating point { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { using std::log10; using std::ceil; @@ -38,11 +40,64 @@ struct default_digits10_impl // Floating point template struct default_digits10_impl // Integer { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { return 0; } +}; + + +// default implementation of digits(), based on numeric_limits if specialized, +// 0 for integer types, and log2(epsilon()) otherwise. 
+template< typename T, + bool use_numeric_limits = std::numeric_limits::is_specialized, + bool is_integer = NumTraits::IsInteger> +struct default_digits_impl +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { return std::numeric_limits::digits; } +}; + +template +struct default_digits_impl // Floating point +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { + using std::log; + using std::ceil; + typedef typename NumTraits::Real Real; + return int(ceil(-log(NumTraits::epsilon())/log(static_cast(2)))); + } +}; + +template +struct default_digits_impl // Integer +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; } }; } // end namespace internal +namespace numext { +/** \internal bit-wise cast without changing the underlying bit representation. */ + +// TODO: Replace by std::bit_cast (available in C++20) +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { +#if EIGEN_HAS_TYPE_TRAITS + // The behaviour of memcpy is not specified for non-trivially copyable types + EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value, THIS_TYPE_IS_NOT_SUPPORTED); + EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value && std::is_default_constructible::value, + THIS_TYPE_IS_NOT_SUPPORTED); +#endif + + EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED); + Tgt tgt; + EIGEN_USING_STD(memcpy) + memcpy(&tgt, &src, sizeof(Tgt)); + return tgt; +} +} // namespace numext + /** \class NumTraits * \ingroup Core_Module * @@ -71,7 +126,7 @@ struct default_digits10_impl // Integer * and to \c 0 otherwise. * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed * to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers. - * Stay vague here. No need to do architecture-specific stuff. + * Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost. 
* \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned. * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must * be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise. @@ -80,9 +135,18 @@ struct default_digits10_impl // Integer * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default * value by the fuzzy comparison operators. * \li highest() and lowest() functions returning the highest and lowest possible values respectively. + * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is + * the analogue of std::numeric_limits::digits + * which is used as the default implementation if specialized. * \li digits10() function returning the number of decimal digits that can be represented without change. This is * the analogue of std::numeric_limits::digits10 * which is used as the default implementation if specialized. + * \li min_exponent() and max_exponent() functions returning the lowest and highest possible values, respectively, + * such that the radix raised to the power exponent-1 is a normalized floating-point number. These are equivalent to + * std::numeric_limits::min_exponent/ + * std::numeric_limits::max_exponent. + * \li infinity() function returning a representation of positive infinity, if available. + * \li quiet_NaN() function returning a non-signaling "not-a-number", if available. 
*/ template struct GenericNumTraits @@ -106,42 +170,60 @@ template struct GenericNumTraits typedef T Nested; typedef T Literal; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return numext::numeric_limits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return internal::default_digits10_impl::run(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int digits() + { + return internal::default_digits_impl::run(); + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int min_exponent() + { + return numext::numeric_limits::min_exponent; + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int max_exponent() + { + return numext::numeric_limits::max_exponent; + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { // make sure to override this for floating-point types return Real(0); } - - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T highest() { return (numext::numeric_limits::max)(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T lowest() { - return IsInteger ? (numext::numeric_limits::min)() : (-(numext::numeric_limits::max)()); + return IsInteger ? 
(numext::numeric_limits::min)() + : static_cast(-(numext::numeric_limits::max)()); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T infinity() { return numext::numeric_limits::infinity(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T quiet_NaN() { return numext::numeric_limits::quiet_NaN(); } @@ -153,19 +235,20 @@ template struct NumTraits : GenericNumTraits template<> struct NumTraits : GenericNumTraits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline float dummy_precision() { return 1e-5f; } }; template<> struct NumTraits : GenericNumTraits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline double dummy_precision() { return 1e-12; } }; template<> struct NumTraits : GenericNumTraits { + EIGEN_CONSTEXPR static inline long double dummy_precision() { return 1e-15l; } }; @@ -182,11 +265,11 @@ template struct NumTraits > MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost }; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { return NumTraits::dummy_precision(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return NumTraits::digits10(); } }; @@ -206,16 +289,17 @@ struct NumTraits > IsInteger = NumTraits::IsInteger, IsSigned = NumTraits::IsSigned, RequireInitialization = 1, - ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::ReadCost, - AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::AddCost, - MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::MulCost + ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? 
HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::ReadCost), + AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::AddCost), + MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::MulCost) }; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } + EIGEN_CONSTEXPR static inline int digits10() { return NumTraits::digits10(); } }; @@ -229,6 +313,7 @@ template<> struct NumTraits MulCost = HugeCost }; + EIGEN_CONSTEXPR static inline int digits10() { return 0; } private: @@ -243,6 +328,8 @@ template<> struct NumTraits // Empty specialization for void to allow template specialization based on NumTraits::Real with T==void and SFINAE. template<> struct NumTraits {}; +template<> struct NumTraits : GenericNumTraits {}; + } // end namespace Eigen #endif // EIGEN_NUMTRAITS_H diff --git a/inst/include/Eigen/src/Core/PartialReduxEvaluator.h b/inst/include/Eigen/src/Core/PartialReduxEvaluator.h new file mode 100644 index 00000000..29abf35b --- /dev/null +++ b/inst/include/Eigen/src/Core/PartialReduxEvaluator.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011-2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PARTIALREDUX_H +#define EIGEN_PARTIALREDUX_H + +namespace Eigen { + +namespace internal { + + +/*************************************************************************** +* +* This file provides evaluators for partial reductions. 
+* There are two modes: +* +* - scalar path: simply calls the respective function on the column or row. +* -> nothing special here, all the tricky part is handled by the return +* types of VectorwiseOp's members. They embed the functor calling the +* respective DenseBase's member function. +* +* - vectorized path: implements a packet-wise reduction followed by +* some (optional) processing of the outcome, e.g., division by n for mean. +* +* For the vectorized path let's observe that the packet-size and outer-unrolling +* are both decided by the assignment logic. So all we have to do is to decide +* on the inner unrolling. +* +* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h, +* but we need to be careful to specify the correct increment. +* +***************************************************************************/ + + +/* logic deciding a strategy for unrolling of vectorized paths */ +template +struct packetwise_redux_traits +{ + enum { + OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime, + Cost = OuterSize == Dynamic ? HugeCost + : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits::Cost, + Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? 
CompleteUnrolling : NoUnrolling + }; + +}; + +/* Value to be returned when size==0 , by default let's return 0 */ +template +EIGEN_DEVICE_FUNC +PacketType packetwise_redux_empty_value(const Func& ) { return pset1(0); } + +/* For products the default is 1 */ +template +EIGEN_DEVICE_FUNC +PacketType packetwise_redux_empty_value(const scalar_product_op& ) { return pset1(1); } + +/* Perform the actual reduction */ +template::Unrolling +> +struct packetwise_redux_impl; + +/* Perform the actual reduction with unrolling */ +template +struct packetwise_redux_impl +{ + typedef redux_novec_unroller Base; + typedef typename Evaluator::Scalar Scalar; + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + PacketType run(const Evaluator &eval, const Func& func, Index /*size*/) + { + return redux_vec_unroller::OuterSize>::template run(eval,func); + } +}; + +/* Add a specialization of redux_vec_unroller for size==0 at compiletime. + * This specialization is not required for general reductions, which is + * why it is defined here. 
+ */ +template +struct redux_vec_unroller +{ + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f) + { + return packetwise_redux_empty_value(f); + } +}; + +/* Perform the actual reduction for dynamic sizes */ +template +struct packetwise_redux_impl +{ + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + template + EIGEN_DEVICE_FUNC + static PacketType run(const Evaluator &eval, const Func& func, Index size) + { + if(size==0) + return packetwise_redux_empty_value(func); + + const Index size4 = (size-1)&(~3); + PacketType p = eval.template packetByOuterInner(0,0); + Index i = 1; + // This loop is optimized for instruction pipelining: + // - each iteration generates two independent instructions + // - thanks to branch prediction and out-of-order execution we have independent instructions across loops + for(; i(i+0,0),eval.template packetByOuterInner(i+1,0)), + func.packetOp(eval.template packetByOuterInner(i+2,0),eval.template packetByOuterInner(i+3,0)))); + for(; i(i,0)); + return p; + } +}; + +template< typename ArgType, typename MemberOp, int Direction> +struct evaluator > + : evaluator_base > +{ + typedef PartialReduxExpr XprType; + typedef typename internal::nested_eval::type ArgTypeNested; + typedef typename internal::add_const_on_value_type::type ConstArgTypeNested; + typedef typename internal::remove_all::type ArgTypeNestedCleaned; + typedef typename ArgType::Scalar InputScalar; + typedef typename XprType::Scalar Scalar; + enum { + TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) + }; + typedef typename MemberOp::template Cost CostOpType; + enum { + CoeffReadCost = TraversalSize==Dynamic ? HugeCost + : TraversalSize==0 ? 
1 + : int(TraversalSize) * int(evaluator::CoeffReadCost) + int(CostOpType::value), + + _ArgFlags = evaluator::Flags, + + _Vectorizable = bool(int(_ArgFlags)&PacketAccessBit) + && bool(MemberOp::Vectorizable) + && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0) + && (TraversalSize!=0), + + Flags = (traits::Flags&RowMajorBit) + | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) + | (_Vectorizable ? PacketAccessBit : 0) + | LinearAccessBit, + + Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized + }; + + EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) + : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value))); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const + { + return coeff(Direction==Vertical ? j : i); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const + { + return m_functor(m_arg.template subVector(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketType packet(Index i, Index j) const + { + return packet(Direction==Vertical ? j : i); + } + + template + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + PacketType packet(Index idx) const + { + enum { PacketSize = internal::unpacket_traits::size }; + typedef Block PanelType; + + PanelType panel(m_arg, + Direction==Vertical ? 0 : idx, + Direction==Vertical ? idx : 0, + Direction==Vertical ? m_arg.rows() : Index(PacketSize), + Direction==Vertical ? Index(PacketSize) : m_arg.cols()); + + // FIXME + // See bug 1612, currently if PacketSize==1 (i.e. complex with 128bits registers) then the storage-order of panel get reversed + // and methods like packetByOuterInner do not make sense anymore in this context. 
+ // So let's just by pass "vectorization" in this case: + if(PacketSize==1) + return internal::pset1(coeff(idx)); + + typedef typename internal::redux_evaluator PanelEvaluator; + PanelEvaluator panel_eval(panel); + typedef typename MemberOp::BinaryOp BinaryOp; + PacketType p = internal::packetwise_redux_impl::template run(panel_eval,m_functor.binaryFunc(),m_arg.outerSize()); + return p; + } + +protected: + ConstArgTypeNested m_arg; + const MemberOp m_functor; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PARTIALREDUX_H diff --git a/inst/include/Eigen/src/Core/PermutationMatrix.h b/inst/include/Eigen/src/Core/PermutationMatrix.h index 47c06ba7..69401bf4 100644 --- a/inst/include/Eigen/src/Core/PermutationMatrix.h +++ b/inst/include/Eigen/src/Core/PermutationMatrix.h @@ -88,13 +88,13 @@ class PermutationBase : public EigenBase } /** \returns the number of rows */ - inline Index rows() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); } /** \returns the number of columns */ - inline Index cols() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); } /** \returns the size of a side of the respective square matrix, i.e., the number of indices */ - inline Index size() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); } #ifndef EIGEN_PARSED_BY_DOXYGEN template diff --git a/inst/include/Eigen/src/Core/PlainObjectBase.h b/inst/include/Eigen/src/Core/PlainObjectBase.h index 0f3632cf..e2ddbd1d 100644 --- a/inst/include/Eigen/src/Core/PlainObjectBase.h +++ b/inst/include/Eigen/src/Core/PlainObjectBase.h @@ -13,10 +13,10 @@ #if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO) # define EIGEN_INITIALIZE_COEFFS -# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i::quiet_NaN(); +# define 
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i::quiet_NaN(); #else # undef EIGEN_INITIALIZE_COEFFS # define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED @@ -104,7 +104,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Scalar Scalar; - + typedef typename internal::packet_traits::type PacketScalar; typedef typename NumTraits::Real RealScalar; typedef Derived DenseType; @@ -118,16 +118,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type using Base::IsVectorAtCompileTime; using Base::Flags; - template friend class Eigen::Map; - friend class Eigen::Map; typedef Eigen::Map MapType; - friend class Eigen::Map; typedef const Eigen::Map ConstMapType; -#if EIGEN_MAX_ALIGN_BYTES>0 - // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice. - friend class Eigen::Map; - friend class Eigen::Map; -#endif typedef Eigen::Map AlignedMapType; typedef const Eigen::Map ConstAlignedMapType; template struct StridedMapType { typedef Eigen::Map type; }; @@ -147,10 +139,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast(this); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); } /** This is an overloaded version of DenseCoeffsBase::coeff(Index,Index) const * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts. 
@@ -358,7 +350,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * remain row-vectors and vectors remain vectors. */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resizeLike(const EigenBase& _other) { const OtherDerived& other = _other.derived(); @@ -383,7 +375,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or * conservativeResize(Index, NoChange_t). * - * Matrices are resized relative to the top-left element. In case values need to be + * Matrices are resized relative to the top-left element. In case values need to be * appended to the matrix they will be uninitialized. */ EIGEN_DEVICE_FUNC @@ -440,7 +432,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or * conservativeResize(Index, NoChange_t). * - * Matrices are resized relative to the top-left element. In case values need to be + * Matrices are resized relative to the top-left element. In case values need to be * appended to the matrix they will copied from \c other. */ template @@ -508,8 +500,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT { - using std::swap; - swap(m_storage, other.m_storage); + _check_template_params(); + m_storage = std::move(other.m_storage); return *this; } #endif @@ -526,6 +518,71 @@ class PlainObjectBase : public internal::dense_xpr_base::type // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + #if EIGEN_HAS_CXX11 + /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients. \cpp11 + * + * \only_for_vectors + * + * This constructor is for 1D arrays or vectors with more than 4 coefficients. + * There exist analogous C++98 constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. 
+ * + * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this + * constructor must match the fixed number of rows (resp. columns) of \c *this. + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : m_storage() + { + _check_template_params(); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4); + m_storage.data()[0] = a0; + m_storage.data()[1] = a1; + m_storage.data()[2] = a2; + m_storage.data()[3] = a3; + Index i = 4; + auto x = {(m_storage.data()[i++] = args, 0)...}; + static_cast(x); + } + + /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list) + : m_storage() + { + _check_template_params(); + + size_t list_size = 0; + if (list.begin() != list.end()) { + list_size = list.begin()->size(); + } + + // This is to allow syntax like VectorXi {{1, 2, 3, 4}} + if (ColsAtCompileTime == 1 && list.size() == 1) { + eigen_assert(list_size == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + resize(list_size, ColsAtCompileTime); + std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data()); + } else { + eigen_assert(list.size() == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + eigen_assert(list_size == static_cast(ColsAtCompileTime) || ColsAtCompileTime == Dynamic); + resize(list.size(), list_size); + + Index row_index = 0; + for (const std::initializer_list& row : list) { + eigen_assert(list_size == row.size()); + Index col_index = 0; + for (const Scalar& e : row) { + coeffRef(row_index, col_index) = e; + ++col_index; + } + ++row_index; + } + } + } + #endif // end EIGEN_HAS_CXX11 + /** \sa PlainObjectBase::operator=(const EigenBase&) */ template 
EIGEN_DEVICE_FUNC @@ -564,7 +621,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \copydetails DenseBase::operator=(const EigenBase &other) */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase &other) { _resize_to_match(other); @@ -652,18 +709,26 @@ class PlainObjectBase : public internal::dense_xpr_base::type using Base::setConstant; EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val); EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val); using Base::setZero; EIGEN_DEVICE_FUNC Derived& setZero(Index size); EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols); + EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t); using Base::setOnes; EIGEN_DEVICE_FUNC Derived& setOnes(Index size); EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols); + EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t); using Base::setRandom; Derived& setRandom(Index size); Derived& setRandom(Index rows, Index cols); + Derived& setRandom(NoChange_t, Index cols); + Derived& setRandom(Index rows, NoChange_t); #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN #include EIGEN_PLAINOBJECTBASE_PLUGIN @@ -678,7 +743,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * remain row-vectors and vectors remain vectors. 
*/ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase& other) { #ifdef EIGEN_NO_AUTOMATIC_RESIZING @@ -705,10 +770,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \internal */ - // aliasing is dealt once in internall::call_assignment + // aliasing is dealt once in internal::call_assignment // so at this stage we have to assume aliasing... and resising has to be done later. template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& _set(const DenseBase& other) { internal::call_assignment(this->derived(), other.derived()); @@ -721,7 +786,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \sa operator=(const MatrixBase&), _set() */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase& other) { // I don't think we need this resize call since the lazyAssign will anyways resize @@ -744,18 +809,18 @@ class PlainObjectBase : public internal::dense_xpr_base::type FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(rows,cols); } - + template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if::type* = 0) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2) m_storage.data()[0] = Scalar(val0); m_storage.data()[1] = Scalar(val1); } - + template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1, typename internal::enable_if< (!internal::is_same::value) && (internal::is_same::value) @@ -781,8 +846,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(size); } - - // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted) + + // We have a 1x1 matrix/array => the argument is interpreted as 
the value of the unique coefficient (case where scalar type can be implicitly converted) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) @@ -790,7 +855,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1) m_storage.data()[0] = val0; } - + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type) template EIGEN_DEVICE_FUNC @@ -846,7 +911,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type { this->derived() = r; } - + // For fixed-size Array template EIGEN_DEVICE_FUNC @@ -858,7 +923,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type { Base::setConstant(val0); } - + // For fixed-size Array template EIGEN_DEVICE_FUNC @@ -872,38 +937,38 @@ class PlainObjectBase : public internal::dense_xpr_base::type { Base::setConstant(val0); } - + template friend struct internal::matrix_swap_impl; public: - + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal * \brief Override DenseBase::swap() since for dynamic-sized matrices * of same type it is enough to swap the data pointers. 
*/ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase & other) { enum { SwapPointers = internal::is_same::value && Base::SizeAtCompileTime==Dynamic }; internal::matrix_swap_impl::run(this->derived(), other.derived()); } - + /** \internal * \brief const version forwarded to DenseBase::swap */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase const & other) { Base::swap(other.derived()); } - - EIGEN_DEVICE_FUNC + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void _check_template_params() { - EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor) - && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0) + EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor) + && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0) && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0)) && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0)) && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0)) @@ -915,6 +980,17 @@ class PlainObjectBase : public internal::dense_xpr_base::type } enum { IsPlainObjectBase = 1 }; +#endif + public: + // These apparently need to be down here for nvcc+icc to prevent duplicate + // Map symbol. + template friend class Eigen::Map; + friend class Eigen::Map; + friend class Eigen::Map; +#if EIGEN_MAX_ALIGN_BYTES>0 + // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice. 
+ friend class Eigen::Map; + friend class Eigen::Map; #endif }; @@ -923,13 +999,19 @@ namespace internal { template struct conservative_resize_like_impl { + #if EIGEN_HAS_TYPE_TRAITS + static const bool IsRelocatable = std::is_trivially_copyable::value; + #else + static const bool IsRelocatable = !NumTraits::RequireInitialization; + #endif static void run(DenseBase& _this, Index rows, Index cols) { if (_this.rows() == rows && _this.cols() == cols) return; EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) - if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns + if ( IsRelocatable + && (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns { internal::check_rows_cols_for_overflow::run(rows, cols); _this.derived().m_storage.conservativeResize(rows*cols,rows,cols); @@ -937,7 +1019,7 @@ struct conservative_resize_like_impl else { // The storage order does not allow us to use reallocation. 
- typename Derived::PlainObject tmp(rows,cols); + Derived tmp(rows,cols); const Index common_rows = numext::mini(rows, _this.rows()); const Index common_cols = numext::mini(cols, _this.cols()); tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); @@ -957,8 +1039,9 @@ struct conservative_resize_like_impl EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived) - if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns + if ( IsRelocatable && + (( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns { const Index new_rows = other.rows() - _this.rows(); const Index new_cols = other.cols() - _this.cols(); @@ -971,7 +1054,7 @@ struct conservative_resize_like_impl else { // The storage order does not allow us to use reallocation. - typename Derived::PlainObject tmp(other); + Derived tmp(other); const Index common_rows = numext::mini(tmp.rows(), _this.rows()); const Index common_cols = numext::mini(tmp.cols(), _this.cols()); tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); @@ -986,13 +1069,18 @@ template struct conservative_resize_like_impl : conservative_resize_like_impl { - using conservative_resize_like_impl::run; - + typedef conservative_resize_like_impl Base; + using Base::run; + using Base::IsRelocatable; + static void run(DenseBase& _this, Index size) { const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size; const Index new_cols = Derived::RowsAtCompileTime==1 ? 
size : 1; - _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + if(IsRelocatable) + _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + else + Base::run(_this.derived(), new_rows, new_cols); } static void run(DenseBase& _this, const DenseBase& other) @@ -1003,7 +1091,10 @@ struct conservative_resize_like_impl const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1; - _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + if(IsRelocatable) + _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + else + Base::run(_this.derived(), new_rows, new_cols); if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements); @@ -1014,7 +1105,7 @@ template struct matrix_swap_impl { EIGEN_DEVICE_FUNC - static inline void run(MatrixTypeA& a, MatrixTypeB& b) + static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b) { a.base().swap(b); } diff --git a/inst/include/Eigen/src/Core/Product.h b/inst/include/Eigen/src/Core/Product.h index 676c4802..70a6c106 100644 --- a/inst/include/Eigen/src/Core/Product.h +++ b/inst/include/Eigen/src/Core/Product.h @@ -23,25 +23,25 @@ struct traits > typedef typename remove_all::type RhsCleaned; typedef traits LhsTraits; typedef traits RhsTraits; - + typedef MatrixXpr XprKind; - + typedef typename ScalarBinaryOpTraits::Scalar, typename traits::Scalar>::ReturnType Scalar; typedef typename product_promote_storage_type::ret>::ret StorageKind; typedef typename promote_index_type::type StorageIndex; - + enum { RowsAtCompileTime = LhsTraits::RowsAtCompileTime, ColsAtCompileTime = RhsTraits::ColsAtCompileTime, MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime, MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime, - + // FIXME: only needed by GeneralMatrixMatrixTriangular InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, 
RhsTraits::RowsAtCompileTime), - + // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator. Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0 @@ -74,10 +74,10 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, internal::product_type<_Lhs,_Rhs>::ret>::ret> { public: - + typedef _Lhs Lhs; typedef _Rhs Rhs; - + typedef typename ProductImpl< Lhs, Rhs, Option, typename internal::product_promote_storage_type::StorageKind, @@ -90,18 +90,23 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, typedef typename internal::remove_all::type LhsNestedCleaned; typedef typename internal::remove_all::type RhsNestedCleaned; - EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) { eigen_assert(lhs.cols() == rhs.rows() && "invalid matrix product" && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } - EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } - EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const LhsNestedCleaned& lhs() const { return m_lhs; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const RhsNestedCleaned& rhs() const { return m_rhs; } protected: @@ -110,13 +115,13 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, }; namespace internal { - + template::ret> class 
dense_product_base : public internal::dense_xpr_base >::type {}; -/** Convertion to scalar for inner-products */ +/** Conversion to scalar for inner-products */ template class dense_product_base : public internal::dense_xpr_base >::type @@ -126,8 +131,8 @@ class dense_product_base public: using Base::derived; typedef typename Base::Scalar Scalar; - - EIGEN_STRONG_INLINE operator const Scalar() const + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } @@ -148,25 +153,25 @@ class ProductImpl : public internal::dense_product_base { typedef Product Derived; - + public: - + typedef typename internal::dense_product_base Base; EIGEN_DENSE_PUBLIC_INTERFACE(Derived) protected: enum { - IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && + IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic), EnableCoeff = IsOneByOne || Option==LazyProduct }; - + public: - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); - + return internal::evaluator(derived()).coeff(row,col); } @@ -174,11 +179,11 @@ class ProductImpl { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); - + return internal::evaluator(derived()).coeff(i); } - - + + }; } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/ProductEvaluators.h b/inst/include/Eigen/src/Core/ProductEvaluators.h index bce1310c..8cf294b2 100644 --- a/inst/include/Eigen/src/Core/ProductEvaluators.h +++ b/inst/include/Eigen/src/Core/ProductEvaluators.h @@ -14,27 +14,27 @@ #define EIGEN_PRODUCTEVALUATORS_H namespace Eigen { - + namespace internal { /** \internal * 
Evaluator of a product expression. * Since products require special treatments to handle all possible cases, - * we simply deffer the evaluation logic to a product_evaluator class + * we simply defer the evaluation logic to a product_evaluator class * which offers more partial specialization possibilities. - * + * * \sa class product_evaluator */ template -struct evaluator > +struct evaluator > : public product_evaluator > { typedef Product XprType; typedef product_evaluator Base; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {} }; - + // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" // TODO we should apply that rule only if that's really helpful template @@ -62,12 +62,12 @@ struct evaluator, template -struct evaluator, DiagIndex> > +struct evaluator, DiagIndex> > : public evaluator, DiagIndex> > { typedef Diagonal, DiagIndex> XprType; typedef evaluator, DiagIndex> > Base; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(Diagonal, DiagIndex>( Product(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), @@ -108,27 +108,27 @@ struct product_evaluator, ProductTag, LhsShape, RhsSh : m_result(xpr.rows(), xpr.cols()) { ::new (static_cast(this)) Base(m_result); - + // FIXME shall we handle nested_eval here?, // if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.) 
// typedef typename internal::nested_eval::type LhsNested; // typedef typename internal::nested_eval::type RhsNested; // typedef typename internal::remove_all::type LhsNestedCleaned; // typedef typename internal::remove_all::type RhsNestedCleaned; -// +// // const LhsNested lhs(xpr.lhs()); // const RhsNested rhs(xpr.rhs()); -// +// // generic_product_impl::evalTo(m_result, lhs, rhs); generic_product_impl::evalTo(m_result, xpr.lhs(), xpr.rhs()); } - -protected: + +protected: PlainObject m_result; }; -// The following three shortcuts are enabled only if the scalar types match excatly. +// The following three shortcuts are enabled only if the scalar types match exactly. // TODO: we could enable them for different scalar types when the product is not vectorized. // Dense = Product @@ -137,7 +137,7 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -155,7 +155,7 @@ struct Assignment, internal::add_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -170,7 +170,7 @@ struct Assignment, internal::sub_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -190,7 +190,7 @@ struct Assignment, const CwiseNullaryOp,Plain>, const Product > SrcXprType; - static 
EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) { call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func); @@ -217,7 +217,7 @@ template - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) { call_assignment_no_alias(dst, src.lhs(), Func1()); @@ -246,19 +246,19 @@ template struct generic_product_impl { template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); } - + template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } - + template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -269,10 +269,10 @@ struct generic_product_impl // Column major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - typename nested_eval::type actual_lhs(lhs); + ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs); // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too const Index cols 
= dst.cols(); @@ -282,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const // Row major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - typename nested_eval::type actual_rhs(rhs); + ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs); // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too const Index rows = dst.rows(); @@ -298,43 +298,43 @@ struct generic_product_impl { template struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; typedef typename Product::Scalar Scalar; - + // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose - struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; - struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; - struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; + struct set { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; + struct add { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; + struct sub { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; struct adds { Scalar m_scale; explicit adds(const Scalar& s) : m_scale(s) {} - template void operator()(const Dst& dst, const Src& src) const { + template void 
EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += m_scale * src; } }; - + template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } - + template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); } - + template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); } - + template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); } - + }; @@ -343,21 +343,21 @@ template struct generic_product_impl_base { typedef typename Product::Scalar Scalar; - + template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } - + template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); } }; @@ -373,8 +373,13 @@ struct generic_product_impl typedef typename internal::remove_all::type>::type MatrixType; template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { + // Fallback to inner product if both the lhs and rhs is a runtime vector. + if (lhs.rows() == 1 && rhs.cols() == 1) { + dst.coeffRef(0,0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0)); + return; + } LhsNested actual_lhs(lhs); RhsNested actual_rhs(rhs); internal::gemv_dense_selector }; template -struct generic_product_impl +struct generic_product_impl { typedef typename Product::Scalar Scalar; - + template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); // but easier on the compiler side @@ -398,48 +403,71 @@ struct generic_product_impl } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() += lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op()); } - + template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } - // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: - // dst {,+,-}= s * (A.lazyProduct(B)) - // This is a huge benefit for heap-allocated matrix types as it save one costly allocation. - // For them, this strategy is also faster than simply by-passing the heap allocation through - // stack allocation. - // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, - // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only, - // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h - template + // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance: + // dst {,+,-}= (s1*A)*(B*s2) + // will be rewritten as: + // dst {,+,-}= (s1*s2) * (A.lazyProduct(B)) + // There are at least four benefits of doing so: + // 1 - huge performance gain for heap-allocated matrix types as it save costly allocations. + // 2 - it is faster than simply by-passing the heap allocation through stack allocation. + // 3 - it makes this fallback consistent with the heavy GEMM routine. + // 4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices. + // (see https://stackoverflow.com/questions/54738495) + // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently + // enabled only when falling back from the main GEMM. 
+ template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic(Dst& dst, const CwiseBinaryOp, - const CwiseNullaryOp, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) + void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func) { - call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); + enum { + HasScalarFactor = blas_traits::HasScalarFactor || blas_traits::HasScalarFactor, + ConjLhs = blas_traits::NeedToConjugate, + ConjRhs = blas_traits::NeedToConjugate + }; + // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto + // this is important for real*complex_mat + Scalar actualAlpha = combine_scalar_factors(lhs, rhs); + + eval_dynamic_impl(dst, + blas_traits::extract(lhs).template conjugateIf(), + blas_traits::extract(rhs).template conjugateIf(), + func, + actualAlpha, + typename conditional::type()); } - // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above - // overload more specialized. 
- template +protected: + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s /* == 1 */, false_type) + { + EIGEN_UNUSED_VARIABLE(s); + eigen_internal_assert(s==Scalar(1)); + call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + } + + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type) { - call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func); } - - -// template -// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) -// { dst.noalias() += alpha * lhs.lazyProduct(rhs); } }; // This specialization enforces the use of a coefficient-based evaluation strategy @@ -497,7 +525,7 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename internal::nested_eval::type LhsNested; typedef typename internal::nested_eval::type RhsNested; - + typedef typename internal::remove_all::type LhsNestedCleaned; typedef typename internal::remove_all::type RhsNestedCleaned; @@ -516,19 +544,19 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename find_best_packet::type RhsVecPacketType; enum { - + LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, CoeffReadCost = InnerSize==0 ? NumTraits::ReadCost : InnerSize == Dynamic ? 
HugeCost - : InnerSize * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + : InnerSize * (NumTraits::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost)) + (InnerSize - 1) * NumTraits::AddCost, Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, - + LhsFlags = LhsEtorType::Flags, RhsFlags = RhsEtorType::Flags, - + LhsRowMajor = LhsFlags & RowMajorBit, RhsRowMajor = RhsFlags & RowMajorBit, @@ -538,7 +566,7 @@ struct product_evaluator, ProductTag, DenseShape, // Here, we don't care about alignment larger than the usable packet size. LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), - + SameType = is_same::value, CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1), @@ -548,12 +576,12 @@ struct product_evaluator, ProductTag, DenseShape, : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 : (bool(RhsRowMajor) && !CanVectorizeLhs), - Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) + Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit) | (EvalToRowMajor ? RowMajorBit : 0) // TODO enable vectorization for mixed types | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) | (XprType::IsVectorAtCompileTime ? 
LinearAccessBit : 0), - + LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)), RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)), @@ -569,10 +597,10 @@ struct product_evaluator, ProductTag, DenseShape, CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) - && (LhsFlags & RhsFlags & ActualPacketAccessBit) - && (InnerSize % packet_traits::size == 0) + && (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit) + && (int(InnerSize) % packet_traits::size == 0) }; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const { return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum(); @@ -582,7 +610,8 @@ struct product_evaluator, ProductTag, DenseShape, * which is why we don't set the LinearAccessBit. * TODO: this seems possible when the result is a vector */ - EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const CoeffReturnType coeff(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0; @@ -590,6 +619,7 @@ struct product_evaluator, ProductTag, DenseShape, } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index row, Index col) const { PacketType res; @@ -601,6 +631,7 @@ struct product_evaluator, ProductTag, DenseShape, } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 
0 : index; @@ -611,7 +642,7 @@ struct product_evaluator, ProductTag, DenseShape, protected: typename internal::add_const_on_value_type::type m_lhs; typename internal::add_const_on_value_type::type m_rhs; - + LhsEtorType m_lhsImpl; RhsEtorType m_rhsImpl; @@ -629,7 +660,8 @@ struct product_evaluator, LazyCoeffBasedProduc enum { Flags = Base::Flags | EvalBeforeNestingBit }; - EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) : Base(BaseProduct(xpr.lhs(),xpr.rhs())) {} }; @@ -641,7 +673,7 @@ struct product_evaluator, LazyCoeffBasedProduc template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl::run(row, col, lhs, rhs, innerDim, res); res = pmadd(pset1(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet(Index(UnrollingIndex-1), col), res); @@ -651,7 +683,7 @@ struct etor_product_packet_impl struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl::run(row, col, lhs, rhs, innerDim, res); res = pmadd(lhs.template packet(row, Index(UnrollingIndex-1)), pset1(rhs.coeff(Index(UnrollingIndex-1), col)), res); @@ -661,7 +693,7 @@ struct etor_product_packet_impl struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index 
/*innerDim*/, Packet &res) { res = pmul(pset1(lhs.coeff(row, Index(0))),rhs.template packet(Index(0), col)); } @@ -670,7 +702,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) { res = pmul(lhs.template packet(row, Index(0)), pset1(rhs.coeff(Index(0), col))); } @@ -679,7 +711,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) { res = pset1(typename unpacket_traits::type(0)); } @@ -688,7 +720,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) { res = pset1(typename unpacket_traits::type(0)); } @@ -697,7 +729,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { res = pset1(typename unpacket_traits::type(0)); for(Index i = 0; i < innerDim; ++i) @@ -708,7 +740,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static 
EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { res = pset1(typename unpacket_traits::type(0)); for(Index i = 0; i < innerDim; ++i) @@ -730,7 +762,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -744,7 +776,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -765,9 +797,10 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template - static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { selfadjoint_product_impl::run(dst, lhs.nestedExpression(), rhs, alpha); } @@ -778,7 +811,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -790,7 +823,7 @@ struct generic_product_impl /*************************************************************************** * Diagonal products ***************************************************************************/ - + template struct diagonal_product_evaluator_base : evaluator_base @@ -798,17 +831,25 @@ struct diagonal_product_evaluator_base typedef typename ScalarBinaryOpTraits::ReturnType Scalar; public: enum { - CoeffReadCost = NumTraits::MulCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost, - + CoeffReadCost = int(NumTraits::MulCost) + 
int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost), + MatrixFlags = evaluator::Flags, DiagFlags = evaluator::Flags, - _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + + _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor + : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor + : MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor), + _ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft) ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)), _SameTypes = is_same::value, // FIXME currently we need same types, but in the future the next rule should be the one //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))), - _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), + _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) + && _SameTypes + && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit) + && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? 
PacketAccessBit : 0), Alignment = evaluator::Alignment, @@ -817,14 +858,14 @@ struct diagonal_product_evaluator_base || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft) || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight) }; - - diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) + + EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) : m_diagImpl(diag), m_matImpl(mat) { EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::MulCost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const { if(AsScalarProduct) @@ -832,7 +873,7 @@ struct diagonal_product_evaluator_base else return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); } - + protected: template EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const @@ -840,7 +881,7 @@ struct diagonal_product_evaluator_base return internal::pmul(m_matImpl.template packet(row, col), internal::pset1(m_diagImpl.coeff(id))); } - + template EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const { @@ -851,7 +892,7 @@ struct diagonal_product_evaluator_base return internal::pmul(m_matImpl.template packet(row, col), m_diagImpl.template packet(id)); } - + evaluator m_diagImpl; evaluator m_matImpl; }; @@ -866,25 +907,25 @@ struct product_evaluator, ProductTag, DiagonalSha using Base::m_matImpl; using Base::coeff; typedef typename Base::Scalar Scalar; - + typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - - enum { - StorageOrder = int(Rhs::Flags) & RowMajorBit ? 
RowMajor : ColMajor - }; + typedef typename Lhs::DiagonalVectorType DiagonalType; + + + enum { StorageOrder = Base::_StorageOrder }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const { return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } - -#ifndef __CUDACC__ + +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -893,7 +934,7 @@ struct product_evaluator, ProductTag, DiagonalSha return this->template packet_impl(row,col, row, typename internal::conditional::type()); } - + template EIGEN_STRONG_INLINE PacketType packet(Index idx) const { @@ -912,30 +953,30 @@ struct product_evaluator, ProductTag, DenseShape, using Base::m_matImpl; using Base::coeff; typedef typename Base::Scalar Scalar; - + typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - - enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? 
RowMajor : ColMajor }; + + enum { StorageOrder = Base::_StorageOrder }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const { return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } - -#ifndef __CUDACC__ + +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return this->template packet_impl(row,col, col, typename internal::conditional::type()); } - + template EIGEN_STRONG_INLINE PacketType packet(Index idx) const { @@ -963,7 +1004,7 @@ struct permutation_matrix_product typedef typename remove_all::type MatrixTypeCleaned; template - static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { MatrixType mat(xpr); const Index n = Side==OnTheLeft ? 
mat.rows() : mat.cols(); @@ -1017,7 +1058,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, lhs, rhs); } @@ -1027,7 +1068,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, rhs, lhs); } @@ -1037,7 +1078,7 @@ template struct generic_product_impl, Rhs, PermutationShape, MatrixShape, ProductTag> { template - static void evalTo(Dest& dst, const Inverse& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, lhs.nestedExpression(), rhs); } @@ -1047,7 +1088,7 @@ template struct generic_product_impl, MatrixShape, PermutationShape, ProductTag> { template - static void evalTo(Dest& dst, const Lhs& lhs, const Inverse& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse& rhs) { permutation_matrix_product::run(dst, rhs.nestedExpression(), lhs); } @@ -1069,9 +1110,9 @@ struct transposition_matrix_product { typedef typename nested_eval::type MatrixType; typedef typename remove_all::type MatrixTypeCleaned; - + template - static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr) { MatrixType mat(xpr); typedef typename TranspositionType::StorageIndex StorageIndex; @@ -1094,7 +1135,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& 
dst, const Lhs& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, lhs, rhs); } @@ -1104,7 +1145,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, rhs, lhs); } @@ -1115,7 +1156,7 @@ template struct generic_product_impl, Rhs, TranspositionsShape, MatrixShape, ProductTag> { template - static void evalTo(Dest& dst, const Transpose& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, lhs.nestedExpression(), rhs); } @@ -1125,7 +1166,7 @@ template struct generic_product_impl, MatrixShape, TranspositionsShape, ProductTag> { template - static void evalTo(Dest& dst, const Lhs& lhs, const Transpose& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose& rhs) { transposition_matrix_product::run(dst, rhs.nestedExpression(), lhs); } diff --git a/inst/include/Eigen/src/Core/Random.h b/inst/include/Eigen/src/Core/Random.h index 6faf789c..dab2ac8e 100644 --- a/inst/include/Eigen/src/Core/Random.h +++ b/inst/include/Eigen/src/Core/Random.h @@ -128,7 +128,7 @@ DenseBase::Random() * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) */ template -inline Derived& DenseBase::setRandom() +EIGEN_DEVICE_FUNC inline Derived& DenseBase::setRandom() { return *this = Random(rows(), cols()); } @@ -177,6 +177,42 @@ PlainObjectBase::setRandom(Index rows, Index cols) return setRandom(); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to random values. For the parameter of type + * NoChange_t, just pass the special value \c NoChange. 
+ * + * Numbers are uniformly spread through their whole definition range for integer types, + * and in the [-1:1] range for floating point scalar types. + * + * \not_reentrant + * + * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random() + */ +template +EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setRandom(NoChange_t, Index cols) +{ + return setRandom(rows(), cols); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to random values. For the parameter of type + * NoChange_t, just pass the special value \c NoChange. + * + * Numbers are uniformly spread through their whole definition range for integer types, + * and in the [-1:1] range for floating point scalar types. + * + * \not_reentrant + * + * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random() + */ +template +EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setRandom(Index rows, NoChange_t) +{ + return setRandom(rows, cols()); +} + } // end namespace Eigen #endif // EIGEN_RANDOM_H diff --git a/inst/include/Eigen/src/Core/Redux.h b/inst/include/Eigen/src/Core/Redux.h index 760e9f86..b6790d11 100644 --- a/inst/include/Eigen/src/Core/Redux.h +++ b/inst/include/Eigen/src/Core/Redux.h @@ -23,23 +23,29 @@ namespace internal { * Part 1 : the logic deciding a strategy for vectorization and unrolling ***************************************************************************/ -template +template struct redux_traits { public: - typedef typename find_best_packet::type PacketType; + typedef typename find_best_packet::type PacketType; enum { PacketSize = unpacket_traits::size, - InnerMaxSize = int(Derived::IsRowMajor) - ? Derived::MaxColsAtCompileTime - : Derived::MaxRowsAtCompileTime + InnerMaxSize = int(Evaluator::IsRowMajor) + ? 
Evaluator::MaxColsAtCompileTime + : Evaluator::MaxRowsAtCompileTime, + OuterMaxSize = int(Evaluator::IsRowMajor) + ? Evaluator::MaxRowsAtCompileTime + : Evaluator::MaxColsAtCompileTime, + SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic + : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0) + : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize) }; enum { - MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) + MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit) && (functor_traits::PacketAccess), - MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), - MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize + MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit), + MaySliceVectorize = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3) }; public: @@ -51,8 +57,8 @@ struct redux_traits public: enum { - Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost - : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, + Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost + : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits::Cost, UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 
1 : int(PacketSize)) }; @@ -64,18 +70,20 @@ struct redux_traits #ifdef EIGEN_DEBUG_ASSIGN static void debug() { - std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; + std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl; std::cerr.setf(std::ios::hex, std::ios::basefield); - EIGEN_DEBUG_VAR(Derived::Flags) + EIGEN_DEBUG_VAR(Evaluator::Flags) std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(InnerMaxSize) + EIGEN_DEBUG_VAR(OuterMaxSize) + EIGEN_DEBUG_VAR(SliceVectorizedWork) EIGEN_DEBUG_VAR(PacketSize) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearVectorize) EIGEN_DEBUG_VAR(MaySliceVectorize) - EIGEN_DEBUG_VAR(Traversal) + std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(UnrollingLimit) - EIGEN_DEBUG_VAR(Unrolling) + std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl; std::cerr << std::endl; } #endif @@ -87,88 +95,86 @@ struct redux_traits /*** no vectorization ***/ -template +template struct redux_novec_unroller { enum { HalfLength = Length/2 }; - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func) { - return func(redux_novec_unroller::run(mat,func), - redux_novec_unroller::run(mat,func)); + return func(redux_novec_unroller::run(eval,func), + redux_novec_unroller::run(eval,func)); } }; -template -struct redux_novec_unroller +template +struct redux_novec_unroller { enum { - outer = Start / Derived::InnerSizeAtCompileTime, - inner = Start % Derived::InnerSizeAtCompileTime + outer = Start / Evaluator::InnerSizeAtCompileTime, + inner = Start % Evaluator::InnerSizeAtCompileTime }; - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; 
EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) + static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&) { - return mat.coeffByOuterInner(outer, inner); + return eval.coeffByOuterInner(outer, inner); } }; // This is actually dead code and will never be called. It is required // to prevent false warnings regarding failed inlining though // for 0 length run() will never be called at all. -template -struct redux_novec_unroller +template +struct redux_novec_unroller { - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } + static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); } }; /*** vectorization ***/ -template +template struct redux_vec_unroller { - enum { - PacketSize = redux_traits::PacketSize, - HalfLength = Length/2 - }; - - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func) { + enum { + PacketSize = unpacket_traits::size, + HalfLength = Length/2 + }; + return func.packetOp( - redux_vec_unroller::run(mat,func), - redux_vec_unroller::run(mat,func) ); + redux_vec_unroller::template run(eval,func), + redux_vec_unroller::template run(eval,func) ); } }; -template -struct redux_vec_unroller +template +struct redux_vec_unroller { - enum { - index = Start * redux_traits::PacketSize, - outer = index / int(Derived::InnerSizeAtCompileTime), - inner = index % int(Derived::InnerSizeAtCompileTime), - alignment = Derived::Alignment - }; - - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static EIGEN_STRONG_INLINE PacketScalar run(const 
Derived &mat, const Func&) + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&) { - return mat.template packetByOuterInner(outer, inner); + enum { + PacketSize = unpacket_traits::size, + index = Start * PacketSize, + outer = index / int(Evaluator::InnerSizeAtCompileTime), + inner = index % int(Evaluator::InnerSizeAtCompileTime), + alignment = Evaluator::Alignment + }; + return eval.template packetByOuterInner(outer, inner); } }; @@ -176,53 +182,65 @@ struct redux_vec_unroller * Part 3 : implementation of all cases ***************************************************************************/ -template::Traversal, - int Unrolling = redux_traits::Unrolling +template::Traversal, + int Unrolling = redux_traits::Unrolling > struct redux_impl; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + typedef typename Evaluator::Scalar Scalar; + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); Scalar res; - res = mat.coeffByOuterInner(0, 0); - for(Index i = 1; i < mat.innerSize(); ++i) - res = func(res, mat.coeffByOuterInner(0, i)); - for(Index i = 1; i < mat.outerSize(); ++i) - for(Index j = 0; j < mat.innerSize(); ++j) - res = func(res, mat.coeffByOuterInner(i, j)); + res = eval.coeffByOuterInner(0, 0); + for(Index i = 1; i < xpr.innerSize(); ++i) + res = func(res, eval.coeffByOuterInner(0, i)); + for(Index i = 1; i < xpr.outerSize(); ++i) + for(Index j = 0; j < xpr.innerSize(); ++j) + res = func(res, eval.coeffByOuterInner(i, j)); return res; } }; -template -struct redux_impl - : public redux_novec_unroller -{}; +template +struct 
redux_impl + : redux_novec_unroller +{ + typedef redux_novec_unroller Base; + typedef typename Evaluator::Scalar Scalar; + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/) + { + return Base::run(eval,func); + } +}; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; - static Scalar run(const Derived &mat, const Func& func) + template + static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - const Index size = mat.size(); + const Index size = xpr.size(); - const Index packetSize = redux_traits::PacketSize; + const Index packetSize = redux_traits::PacketSize; const int packetAlignment = unpacket_traits::alignment; enum { - alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), - alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) + alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? 
int(packetAlignment) : int(Unaligned), + alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment) }; - const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); + const Index alignedStart = internal::first_default_aligned(xpr); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); const Index alignedEnd2 = alignedStart + alignedSize2; @@ -230,34 +248,34 @@ struct redux_impl Scalar res; if(alignedSize) { - PacketScalar packet_res0 = mat.template packet(alignedStart); + PacketScalar packet_res0 = eval.template packet(alignedStart); if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop { - PacketScalar packet_res1 = mat.template packet(alignedStart+packetSize); + PacketScalar packet_res1 = eval.template packet(alignedStart+packetSize); for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) { - packet_res0 = func.packetOp(packet_res0, mat.template packet(index)); - packet_res1 = func.packetOp(packet_res1, mat.template packet(index+packetSize)); + packet_res0 = func.packetOp(packet_res0, eval.template packet(index)); + packet_res1 = func.packetOp(packet_res1, eval.template packet(index+packetSize)); } packet_res0 = func.packetOp(packet_res0,packet_res1); if(alignedEnd>alignedEnd2) - packet_res0 = func.packetOp(packet_res0, mat.template packet(alignedEnd2)); + packet_res0 = func.packetOp(packet_res0, eval.template packet(alignedEnd2)); } res = func.predux(packet_res0); for(Index index = 0; index < alignedStart; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); for(Index index = alignedEnd; index < size; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); } else // too small to vectorize anything. // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. 
{ - res = mat.coeff(0); + res = eval.coeff(0); for(Index index = 1; index < size; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); } return res; @@ -265,130 +283,108 @@ struct redux_impl }; // NOTE: for SliceVectorizedTraversal we simply bypass unrolling -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketType; + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketType; - EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) + template + EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - const Index innerSize = mat.innerSize(); - const Index outerSize = mat.outerSize(); + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); + const Index innerSize = xpr.innerSize(); + const Index outerSize = xpr.outerSize(); enum { - packetSize = redux_traits::PacketSize + packetSize = redux_traits::PacketSize }; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; if(packetedInnerSize) { - PacketType packet_res = mat.template packet(0,0); + PacketType packet_res = eval.template packet(0,0); for(Index j=0; j(j,i)); + packet_res = func.packetOp(packet_res, eval.template packetByOuterInner(j,i)); res = func.predux(packet_res); for(Index j=0; j::run(mat, func); + res = redux_impl::run(eval, func, xpr); } return res; } }; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename redux_traits::PacketType PacketType; enum { - PacketSize = redux_traits::PacketSize, - Size = Derived::SizeAtCompileTime, - VectorizedSize = (Size / PacketSize) * PacketSize 
+ PacketSize = redux_traits::PacketSize, + Size = Evaluator::SizeAtCompileTime, + VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize) }; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + EIGEN_ONLY_USED_FOR_DEBUG(xpr) + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); if (VectorizedSize > 0) { - Scalar res = func.predux(redux_vec_unroller::run(mat,func)); + Scalar res = func.predux(redux_vec_unroller::template run(eval,func)); if (VectorizedSize != Size) - res = func(res,redux_novec_unroller::run(mat,func)); + res = func(res,redux_novec_unroller::run(eval,func)); return res; } else { - return redux_novec_unroller::run(mat,func); + return redux_novec_unroller::run(eval,func); } } }; // evaluator adaptor template -class redux_evaluator +class redux_evaluator : public internal::evaluator<_XprType> { + typedef internal::evaluator<_XprType> Base; public: typedef _XprType XprType; - EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit redux_evaluator(const XprType &xpr) : Base(xpr) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; - typedef typename XprType::PacketReturnType PacketReturnType; enum { MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime, MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator - Flags = evaluator::Flags & ~DirectAccessBit, + Flags = Base::Flags & ~DirectAccessBit, IsRowMajor = XprType::IsRowMajor, 
SizeAtCompileTime = XprType::SizeAtCompileTime, - InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, - CoeffReadCost = evaluator::CoeffReadCost, - Alignment = evaluator::Alignment + InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime }; - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } - EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); } - EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index row, Index col) const - { return m_evaluator.coeff(row, col); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index index) const - { return m_evaluator.coeff(index); } - - template - PacketType packet(Index row, Index col) const - { return m_evaluator.template packet(row, col); } - - template - PacketType packet(Index index) const - { return m_evaluator.template packet(index); } - - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const - { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const - { return m_evaluator.template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return Base::template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } - const XprType & nestedExpression() const { return m_xpr; } - -protected: - internal::evaluator m_evaluator; - const XprType &m_xpr; }; } // end namespace internal @@ -403,39 +399,53 @@ class redux_evaluator * The template parameter \a BinaryOp is the type of the functor \a func which must be * an associative operator. 
Both current C++98 and C++11 functor styles are handled. * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise() */ template template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); typedef typename internal::redux_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - - return internal::redux_impl::run(thisEval, func); + + // The initial expression is passed to the reducer as an additional argument instead of + // passing it as a member of redux_evaluator to help + return internal::redux_impl::run(thisEval, func, derived()); } /** \returns the minimum of all coefficients of \c *this. - * \warning the result is undefined if \c *this contains NaN. + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::minCoeff() const { - return derived().redux(Eigen::internal::scalar_min_op()); + return derived().redux(Eigen::internal::scalar_min_op()); } -/** \returns the maximum of all coefficients of \c *this. - * \warning the result is undefined if \c *this contains NaN. +/** \returns the maximum of all coefficients of \c *this. 
+ * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::maxCoeff() const { - return derived().redux(Eigen::internal::scalar_max_op()); + return derived().redux(Eigen::internal::scalar_max_op()); } /** \returns the sum of all coefficients of \c *this @@ -445,7 +455,7 @@ DenseBase::maxCoeff() const * \sa trace(), prod(), mean() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::sum() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -458,7 +468,7 @@ DenseBase::sum() const * \sa trace(), prod(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::mean() const { #ifdef __INTEL_COMPILER @@ -479,7 +489,7 @@ DenseBase::mean() const * \sa sum(), mean(), trace() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::prod() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -494,7 +504,7 @@ DenseBase::prod() const * \sa diagonal(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar MatrixBase::trace() const { return derived().diagonal().sum(); diff --git a/inst/include/Eigen/src/Core/Ref.h b/inst/include/Eigen/src/Core/Ref.h index 17a1496b..c2a37ead 100644 --- 
a/inst/include/Eigen/src/Core/Ref.h +++ b/inst/include/Eigen/src/Core/Ref.h @@ -10,7 +10,7 @@ #ifndef EIGEN_REF_H #define EIGEN_REF_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,7 +48,7 @@ struct traits > }; typedef typename internal::conditional::type type; }; - + }; template @@ -67,12 +67,12 @@ template class RefBase typedef MapBase Base; EIGEN_DENSE_PUBLIC_INTERFACE(RefBase) - EIGEN_DEVICE_FUNC inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() : IsVectorAtCompileTime ? this->size() @@ -86,36 +86,122 @@ template class RefBase m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime, StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime) {} - + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase) protected: typedef Stride StrideBase; + // Resolves inner stride if default 0. + static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) { + return inner == 0 ? 1 : inner; + } + + // Resolves outer stride if default 0. + static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) { + return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer; + } + + // Returns true if construction is valid, false if there is a stride mismatch, + // and fails if there is a size mismatch. template - EIGEN_DEVICE_FUNC void construct(Expression& expr) + EIGEN_DEVICE_FUNC bool construct(Expression& expr) { - EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression); - + // Check matrix sizes. 
If this is a compile-time vector, we do allow + // implicitly transposing. + EIGEN_STATIC_ASSERT( + EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression) + // If it is a vector, the transpose sizes might match. + || ( PlainObjectType::IsVectorAtCompileTime + && ((int(PlainObjectType::RowsAtCompileTime)==Eigen::Dynamic + || int(Expression::ColsAtCompileTime)==Eigen::Dynamic + || int(PlainObjectType::RowsAtCompileTime)==int(Expression::ColsAtCompileTime)) + && (int(PlainObjectType::ColsAtCompileTime)==Eigen::Dynamic + || int(Expression::RowsAtCompileTime)==Eigen::Dynamic + || int(PlainObjectType::ColsAtCompileTime)==int(Expression::RowsAtCompileTime)))), + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES + ) + + // Determine runtime rows and columns. + Index rows = expr.rows(); + Index cols = expr.cols(); if(PlainObjectType::RowsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); - ::new (static_cast(this)) Base(expr.data(), 1, expr.size()); + rows = 1; + cols = expr.size(); } else if(PlainObjectType::ColsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); - ::new (static_cast(this)) Base(expr.data(), expr.size(), 1); + rows = expr.size(); + cols = 1; + } + // Verify that the sizes are valid. + eigen_assert( + (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows)); + eigen_assert( + (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols)); + + + // If this is a vector, we might be transposing, which means that stride should swap. + const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows()); + // If the storage format differs, we also need to swap the stride. 
+ const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0; + const bool expr_row_major = (Expression::Flags&RowMajorBit) != 0; + const bool storage_differs = (row_major != expr_row_major); + + const bool swap_stride = (transpose != storage_differs); + + // Determine expr's actual strides, resolving any defaults if zero. + const Index expr_inner_actual = resolveInnerStride(expr.innerStride()); + const Index expr_outer_actual = resolveOuterStride(expr_inner_actual, + expr.outerStride(), + expr.rows(), + expr.cols(), + Expression::IsVectorAtCompileTime != 0, + expr_row_major); + + // If this is a column-major row vector or row-major column vector, the inner-stride + // is arbitrary, so set it to either the compile-time inner stride or 1. + const bool row_vector = (rows == 1); + const bool col_vector = (cols == 1); + const Index inner_stride = + ( (!row_major && row_vector) || (row_major && col_vector) ) ? + ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1) + : swap_stride ? expr_outer_actual : expr_inner_actual; + + // If this is a column-major column vector or row-major row vector, the outer-stride + // is arbitrary, so set it to either the compile-time outer stride or vector size. + const Index outer_stride = + ( (!row_major && col_vector) || (row_major && row_vector) ) ? + ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride) + : swap_stride ? expr_inner_actual : expr_outer_actual; + + // Check if given inner/outer strides are compatible with compile-time strides. 
+ const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic) + || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride); + if (!inner_valid) { + return false; } - else - ::new (static_cast(this)) Base(expr.data(), expr.rows(), expr.cols()); - - if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit))) - ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1); - else - ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), - StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); + + const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic) + || (resolveOuterStride( + inner_stride, + Index(StrideType::OuterStrideAtCompileTime), + rows, cols, PlainObjectType::IsVectorAtCompileTime != 0, + row_major) + == outer_stride); + if (!outer_valid) { + return false; + } + + ::new (static_cast(this)) Base(expr.data(), rows, cols); + ::new (&m_stride) StrideBase( + (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride, + (StrideType::InnerStrideAtCompileTime == 0) ? 0 : inner_stride ); + return true; } StrideBase m_stride; @@ -187,6 +273,8 @@ template class RefBase * void foo(const Ref >& A) { foo_impl(A); } * \endcode * + * See also the following stackoverflow questions for further references: + * - Correct usage of the Eigen::Ref<> class * * \sa PlainObjectBase::Map(), \ref TopicStorageOrders */ @@ -210,7 +298,10 @@ template class Ref typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) { EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); - Base::construct(expr.derived()); + // Construction must pass since we will not create temprary storage in the non-const case. 
+ const bool success = Base::construct(expr.derived()); + EIGEN_UNUSED_VARIABLE(success) + eigen_assert(success); } template EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, @@ -224,7 +315,10 @@ template class Ref EIGEN_STATIC_ASSERT(bool(internal::is_lvalue::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); - Base::construct(expr.const_cast_derived()); + // Construction must pass since we will not create temporary storage in the non-const case. + const bool success = Base::construct(expr.const_cast_derived()); + EIGEN_UNUSED_VARIABLE(success) + eigen_assert(success); } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref) @@ -265,7 +359,10 @@ template class Ref< template EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type) { - Base::construct(expr); + // Check if we can use the underlying expr's storage directly, otherwise call the copy version. + if (!Base::construct(expr)) { + construct(expr, internal::false_type()); + } } template diff --git a/inst/include/Eigen/src/Core/Replicate.h b/inst/include/Eigen/src/Core/Replicate.h index 9960ef88..ab5be7e6 100644 --- a/inst/include/Eigen/src/Core/Replicate.h +++ b/inst/include/Eigen/src/Core/Replicate.h @@ -10,7 +10,7 @@ #ifndef EIGEN_REPLICATE_H #define EIGEN_REPLICATE_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -35,7 +35,7 @@ struct traits > IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1 : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0 : (MatrixType::Flags & RowMajorBit) ? 1 : 0, - + // FIXME enable DirectAccess with negative strides? Flags = IsRowMajor ? 
RowMajorBit : 0 }; @@ -88,15 +88,15 @@ template class Replicate THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE) } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); } EIGEN_DEVICE_FUNC const _MatrixTypeNested& nestedExpression() const - { - return m_matrix; + { + return m_matrix; } protected: @@ -115,7 +115,7 @@ template class Replicate */ template template -const Replicate +EIGEN_DEVICE_FUNC const Replicate DenseBase::replicate() const { return Replicate(derived()); @@ -130,7 +130,7 @@ DenseBase::replicate() const * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate */ template -const typename VectorwiseOp::ReplicateReturnType +EIGEN_DEVICE_FUNC const typename VectorwiseOp::ReplicateReturnType VectorwiseOp::replicate(Index factor) const { return typename VectorwiseOp::ReplicateReturnType diff --git a/inst/include/Eigen/src/Core/Reshaped.h b/inst/include/Eigen/src/Core/Reshaped.h new file mode 100644 index 00000000..52de73b6 --- /dev/null +++ b/inst/include/Eigen/src/Core/Reshaped.h @@ -0,0 +1,454 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2017 Gael Guennebaud +// Copyright (C) 2014 yoco +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_RESHAPED_H +#define EIGEN_RESHAPED_H + +namespace Eigen { + +/** \class Reshaped + * \ingroup Core_Module + * + * \brief Expression of a fixed-size or dynamic-size reshape + * + * \tparam XprType the type of the expression in which we are taking a reshape + * \tparam Rows the number of rows of the reshape we are taking at compile time (optional) + * \tparam Cols the number of columns of the reshape we are taking at compile time (optional) + * \tparam Order can be ColMajor or RowMajor, default is ColMajor. + * + * This class represents an expression of either a fixed-size or dynamic-size reshape. + * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and + * most of the time this is the only way it is used. + * + * However, in C++98, if you want to directly maniputate reshaped expressions, + * for instance if you want to write a function returning such an expression, you + * will need to use this class. In C++11, it is advised to use the \em auto + * keyword for such use cases. + * + * Here is an example illustrating the dynamic case: + * \include class_Reshaped.cpp + * Output: \verbinclude class_Reshaped.out + * + * Here is an example illustrating the fixed-size case: + * \include class_FixedReshaped.cpp + * Output: \verbinclude class_FixedReshaped.out + * + * \sa DenseBase::reshaped(NRowsType,NColsType) + */ + +namespace internal { + +template +struct traits > : traits +{ + typedef typename traits::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::XprKind XprKind; + enum{ + MatrixRows = traits::RowsAtCompileTime, + MatrixCols = traits::ColsAtCompileTime, + RowsAtCompileTime = Rows, + ColsAtCompileTime = Cols, + MaxRowsAtCompileTime = Rows, + MaxColsAtCompileTime = Cols, + XpxStorageOrder = ((int(traits::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor, + ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? 
RowMajor + : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor + : XpxStorageOrder, + HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder), + InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime), + InnerStrideAtCompileTime = HasSameStorageOrderAsXprType + ? int(inner_stride_at_compile_time::ret) + : Dynamic, + OuterStrideAtCompileTime = Dynamic, + + HasDirectAccess = internal::has_direct_access::ret + && (Order==int(XpxStorageOrder)) + && ((evaluator::Flags&LinearAccessBit)==LinearAccessBit), + + MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits::size) == 0) + && (InnerStrideAtCompileTime == 1) + ? PacketAccessBit : 0, + //MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, + FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, + FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0, + FlagsDirectAccessBit = HasDirectAccess ? 
DirectAccessBit : 0, + Flags0 = traits::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit), + + Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit) + }; +}; + +template class ReshapedImpl_dense; + +} // end namespace internal + +template class ReshapedImpl; + +template class Reshaped + : public ReshapedImpl::StorageKind> +{ + typedef ReshapedImpl::StorageKind> Impl; + public: + //typedef typename Impl::Base Base; + typedef Impl Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped) + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline Reshaped(XprType& xpr) + : Impl(xpr) + { + EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) + eigen_assert(Rows * Cols == xpr.rows() * xpr.cols()); + } + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline Reshaped(XprType& xpr, + Index reshapeRows, Index reshapeCols) + : Impl(xpr, reshapeRows, reshapeCols) + { + eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows) + && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols)); + eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols()); + } +}; + +// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense +// that must be specialized for direct and non-direct access... +template +class ReshapedImpl + : public internal::ReshapedImpl_dense >::HasDirectAccess> +{ + typedef internal::ReshapedImpl_dense >::HasDirectAccess> Impl; + public: + typedef Impl Base; + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl) + EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {} + EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols) + : Impl(xpr, reshapeRows, reshapeCols) {} +}; + +namespace internal { + +/** \internal Internal implementation of dense Reshaped in the general case. 
*/ +template +class ReshapedImpl_dense + : public internal::dense_xpr_base >::type +{ + typedef Reshaped ReshapedType; + public: + + typedef typename internal::dense_xpr_base::type Base; + EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) + + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename internal::remove_all::type NestedExpression; + + class InnerIterator; + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr) + : m_xpr(xpr), m_rows(Rows), m_cols(Cols) + {} + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols) + : m_xpr(xpr), m_rows(nRows), m_cols(nCols) + {} + + EIGEN_DEVICE_FUNC Index rows() const { return m_rows; } + EIGEN_DEVICE_FUNC Index cols() const { return m_cols; } + + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** \sa MapBase::data() */ + EIGEN_DEVICE_FUNC inline const Scalar* data() const; + EIGEN_DEVICE_FUNC inline Index innerStride() const; + EIGEN_DEVICE_FUNC inline Index outerStride() const; + #endif + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + typename internal::remove_reference::type& + nestedExpression() { return m_xpr; } + + protected: + + MatrixTypeNested m_xpr; + const internal::variable_if_dynamic m_rows; + const internal::variable_if_dynamic m_cols; +}; + + +/** \internal Internal implementation of dense Reshaped in the direct access case. 
*/ +template +class ReshapedImpl_dense + : public MapBase > +{ + typedef Reshaped ReshapedType; + typedef typename internal::ref_selector::non_const_type XprTypeNested; + public: + + typedef MapBase Base; + EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr) + : Base(xpr.data()), m_xpr(xpr) + {} + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols) + : Base(xpr.data(), nRows, nCols), + m_xpr(xpr) + {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& nestedExpression() const + { + return m_xpr; + } + + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } + + /** \sa MapBase::innerStride() */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const + { + return m_xpr.innerStride(); + } + + /** \sa MapBase::outerStride() */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const + { + return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows(); + } + + protected: + + XprTypeNested m_xpr; +}; + +// Evaluators +template struct reshaped_evaluator; + +template +struct evaluator > + : reshaped_evaluator >::HasDirectAccess> +{ + typedef Reshaped XprType; + typedef typename XprType::Scalar Scalar; + // TODO: should check for smaller packet types + typedef typename packet_traits::type PacketScalar; + + enum { + CoeffReadCost = evaluator::CoeffReadCost, + HasDirectAccess = traits::HasDirectAccess, + +// RowsAtCompileTime = traits::RowsAtCompileTime, +// ColsAtCompileTime = traits::ColsAtCompileTime, +// MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, +// MaxColsAtCompileTime = traits::MaxColsAtCompileTime, +// +// InnerStrideAtCompileTime = traits::HasSameStorageOrderAsXprType +// ? 
int(inner_stride_at_compile_time::ret) +// : Dynamic, +// OuterStrideAtCompileTime = Dynamic, + + FlagsLinearAccessBit = (traits::RowsAtCompileTime == 1 || traits::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0, + FlagsRowMajorBit = (traits::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0, + FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0, + Flags0 = evaluator::Flags & (HereditaryBits & ~RowMajorBit), + Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit, + + PacketAlignment = unpacket_traits::alignment, + Alignment = evaluator::Alignment + }; + typedef reshaped_evaluator reshaped_evaluator_type; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } +}; + +template +struct reshaped_evaluator + : evaluator_base > +{ + typedef Reshaped XprType; + + enum { + CoeffReadCost = evaluator::CoeffReadCost /* TODO + cost of index computations */, + + Flags = (evaluator::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)), + + Alignment = 0 + }; + + EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef std::pair RowCol; + + inline RowCol index_remap(Index rowId, Index colId) const + { + if(Order==ColMajor) + { + const Index nth_elem_idx = colId * m_xpr.rows() + rowId; + return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(), + nth_elem_idx / m_xpr.nestedExpression().rows()); + } + else + { + const Index nth_elem_idx = colId + rowId * m_xpr.cols(); + return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(), + nth_elem_idx % m_xpr.nestedExpression().cols()); + } + } + + EIGEN_DEVICE_FUNC + inline Scalar& coeffRef(Index rowId, Index colId) + { + 
EIGEN_STATIC_ASSERT_LVALUE(XprType) + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeff(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline Scalar& coeffRef(Index index) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeffRef(row_col.first, row_col.second); + + } + + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index index) const + { + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline const CoeffReturnType coeff(Index index) const + { + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeff(row_col.first, row_col.second); + } +#if 0 + EIGEN_DEVICE_FUNC + template + inline PacketScalar packet(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.template packet(row_col.first, row_col.second); + + } + + template + EIGEN_DEVICE_FUNC + inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + { + const RowCol row_col = index_remap(rowId, colId); + m_argImpl.const_cast_derived().template writePacket + (row_col.first, row_col.second, val); + } + + template + EIGEN_DEVICE_FUNC + inline PacketScalar packet(Index index) const + { + const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? 
index : 0); + return m_argImpl.template packet(row_col.first, row_col.second); + } + + template + EIGEN_DEVICE_FUNC + inline void writePacket(Index index, const PacketScalar& val) + { + const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); + return m_argImpl.template packet(row_col.first, row_col.second, val); + } +#endif +protected: + + evaluator m_argImpl; + const XprType& m_xpr; + +}; + +template +struct reshaped_evaluator +: mapbase_evaluator, + typename Reshaped::PlainObject> +{ + typedef Reshaped XprType; + typedef typename XprType::Scalar Scalar; + + EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) + : mapbase_evaluator(xpr) + { + // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime + eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_RESHAPED_H diff --git a/inst/include/Eigen/src/Core/ReturnByValue.h b/inst/include/Eigen/src/Core/ReturnByValue.h index c44b7673..4dad13ea 100644 --- a/inst/include/Eigen/src/Core/ReturnByValue.h +++ b/inst/include/Eigen/src/Core/ReturnByValue.h @@ -60,8 +60,10 @@ template class ReturnByValue EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { static_cast(this)->evalTo(dst); } - EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast(this)->rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast(this)->cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return static_cast(this)->rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return static_cast(this)->cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN #define Unusable 
YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT @@ -79,7 +81,7 @@ template class ReturnByValue template template -Derived& DenseBase::operator=(const ReturnByValue& other) +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const ReturnByValue& other) { other.evalTo(derived()); return derived(); @@ -90,7 +92,7 @@ namespace internal { // Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that // when a ReturnByValue expression is assigned, the evaluator is not constructed. // TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world - + template struct evaluator > : public evaluator::ReturnType> @@ -98,7 +100,7 @@ struct evaluator > typedef ReturnByValue XprType; typedef typename internal::traits::ReturnType PlainObject; typedef evaluator Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { diff --git a/inst/include/Eigen/src/Core/Reverse.h b/inst/include/Eigen/src/Core/Reverse.h index 0640cda2..28cdd76a 100644 --- a/inst/include/Eigen/src/Core/Reverse.h +++ b/inst/include/Eigen/src/Core/Reverse.h @@ -12,7 +12,7 @@ #ifndef EIGEN_REVERSE_H #define EIGEN_REVERSE_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -44,7 +44,7 @@ template struct reverse_packet_cond static inline PacketType run(const PacketType& x) { return x; } }; -} // end namespace internal +} // end namespace internal /** \class Reverse * \ingroup Core_Module @@ -89,8 +89,10 @@ template class Reverse EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse) - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return 
m_matrix.cols(); } EIGEN_DEVICE_FUNC inline Index innerStride() const { @@ -98,7 +100,7 @@ template class Reverse } EIGEN_DEVICE_FUNC const typename internal::remove_all::type& - nestedExpression() const + nestedExpression() const { return m_matrix; } @@ -114,7 +116,7 @@ template class Reverse * */ template -inline typename DenseBase::ReverseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ReverseReturnType DenseBase::reverse() { return ReverseReturnType(derived()); @@ -136,7 +138,7 @@ DenseBase::reverse() * * \sa VectorwiseOp::reverseInPlace(), reverse() */ template -inline void DenseBase::reverseInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::reverseInPlace() { if(cols()>rows()) { @@ -161,7 +163,7 @@ inline void DenseBase::reverseInPlace() } namespace internal { - + template struct vectorwise_reverse_inplace_impl; @@ -171,8 +173,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { + const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2; Index half = xpr.rows()/2; - xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse()); + xpr.topRows(fix(half)) + .swap(xpr.bottomRows(fix(half)).colwise().reverse()); } }; @@ -182,8 +186,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { + const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2; Index half = xpr.cols()/2; - xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse()); + xpr.leftCols(fix(half)) + .swap(xpr.rightCols(fix(half)).rowwise().reverse()); } }; @@ -201,9 +207,9 @@ struct vectorwise_reverse_inplace_impl * * \sa DenseBase::reverseInPlace(), reverse() */ template -void VectorwiseOp::reverseInPlace() +EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() { - internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); + 
internal::vectorwise_reverse_inplace_impl::run(m_matrix); } } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/Select.h b/inst/include/Eigen/src/Core/Select.h index 79eec1b5..7c86bf87 100644 --- a/inst/include/Eigen/src/Core/Select.h +++ b/inst/include/Eigen/src/Core/Select.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SELECT_H #define EIGEN_SELECT_H -namespace Eigen { +namespace Eigen { /** \class Select * \ingroup Core_Module @@ -67,8 +67,10 @@ class Select : public internal::dense_xpr_base< Select template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const DenseBase& thenMatrix, const DenseBase& elseMatrix) const { @@ -134,7 +136,7 @@ DenseBase::select(const DenseBase& thenMatrix, */ template template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const DenseBase& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const { @@ -149,7 +151,7 @@ DenseBase::select(const DenseBase& thenMatrix, */ template template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const typename ElseDerived::Scalar& thenScalar, const DenseBase& elseMatrix) const { diff --git a/inst/include/Eigen/src/Core/SelfAdjointView.h b/inst/include/Eigen/src/Core/SelfAdjointView.h index b2e51f37..8ce3b372 100644 --- a/inst/include/Eigen/src/Core/SelfAdjointView.h +++ b/inst/include/Eigen/src/Core/SelfAdjointView.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SELFADJOINTMATRIX_H #define EIGEN_SELFADJOINTMATRIX_H -namespace Eigen { +namespace Eigen { /** \class SelfAdjointView * \ingroup Core_Module @@ -58,14 +58,15 @@ template class SelfAdjointView typedef MatrixTypeNestedCleaned NestedExpression; /** \brief The type of coefficients in this matrix */ - typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::remove_all::type MatrixConjugateReturnType; + typedef SelfAdjointView::type, 
UpLo> ConstSelfAdjointView; enum { Mode = internal::traits::Mode, Flags = internal::traits::Flags, - TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0) + TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0) }; typedef typename MatrixType::PlainObject PlainObject; @@ -75,14 +76,14 @@ template class SelfAdjointView EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY); } - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_matrix.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_matrix.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); } /** \sa MatrixBase::coeff() * \warning the coordinates must fit into the referenced triangular part @@ -131,7 +132,7 @@ template class SelfAdjointView { return Product(lhs.derived(),rhs); } - + friend EIGEN_DEVICE_FUNC const SelfAdjointView operator*(const Scalar& s, const SelfAdjointView& mat) @@ -197,6 +198,18 @@ template class SelfAdjointView inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. 
+ */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_matrix.template conjugateIf()); + } + typedef SelfAdjointView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC @@ -287,17 +300,17 @@ class triangular_dense_assignment_kernel template -typename MatrixBase::template ConstSelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstSelfAdjointViewReturnType::Type MatrixBase::selfadjointView() const { return typename ConstSelfAdjointViewReturnType::Type(derived()); @@ -341,7 +354,7 @@ MatrixBase::selfadjointView() const */ template template -typename MatrixBase::template SelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template SelfAdjointViewReturnType::Type MatrixBase::selfadjointView() { return typename SelfAdjointViewReturnType::Type(derived()); diff --git a/inst/include/Eigen/src/Core/Solve.h b/inst/include/Eigen/src/Core/Solve.h index a8daea51..23d5cb70 100644 --- a/inst/include/Eigen/src/Core/Solve.h +++ b/inst/include/Eigen/src/Core/Solve.h @@ -13,13 +13,13 @@ namespace Eigen { template class SolveImpl; - + /** \class Solve * \ingroup Core_Module * * \brief Pseudo expression representing a solving operation * - * \tparam Decomposition the type of the matrix or decomposion object + * \tparam Decomposition the type of the matrix or decomposition object * \tparam Rhstype the type of the right-hand side * * This class represents an expression of A.solve(B) @@ -64,13 +64,13 @@ class Solve : public SolveImpl::PlainObject PlainObject; typedef typename internal::traits::StorageIndex StorageIndex; - + Solve(const Decomposition &dec, const RhsType &rhs) : m_dec(dec), m_rhs(rhs) {} - - EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() 
const EIGEN_NOEXCEPT { return m_dec.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; } EIGEN_DEVICE_FUNC const RhsType& rhs() const { return m_rhs; } @@ -87,14 +87,14 @@ class SolveImpl : public MatrixBase > { typedef Solve Derived; - + public: - + typedef MatrixBase > Base; EIGEN_DENSE_PUBLIC_INTERFACE(Derived) private: - + Scalar coeff(Index row, Index col) const; Scalar coeff(Index i) const; }; @@ -119,15 +119,15 @@ struct evaluator > typedef evaluator Base; enum { Flags = Base::Flags | EvalBeforeNestingBit }; - + EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve) : m_result(solve.rows(), solve.cols()) { ::new (static_cast(this)) Base(m_result); solve.dec()._solve_impl(solve.rhs(), m_result); } - -protected: + +protected: PlainObject m_result; }; @@ -176,12 +176,12 @@ struct Assignment(src.rhs(), dst); } }; -} // end namepsace internal +} // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/SolveTriangular.h b/inst/include/Eigen/src/Core/SolveTriangular.h index fd0acb1a..dfbf9952 100644 --- a/inst/include/Eigen/src/Core/SolveTriangular.h +++ b/inst/include/Eigen/src/Core/SolveTriangular.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SOLVETRIANGULAR_H #define EIGEN_SOLVETRIANGULAR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -54,7 +54,7 @@ struct triangular_solver_selector typedef blas_traits LhsProductTraits; typedef typename LhsProductTraits::ExtractType ActualLhsType; typedef Map, Aligned> MappedRhs; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { ActualLhsType actualLhs = LhsProductTraits::extract(lhs); @@ -64,7 +64,7 @@ struct triangular_solver_selector ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(), (useRhsDirectly ? 
rhs.data() : 0)); - + if(!useRhsDirectly) MappedRhs(actualRhs,rhs.size()) = rhs; @@ -85,7 +85,7 @@ struct triangular_solver_selector typedef blas_traits LhsProductTraits; typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { typename internal::add_const_on_value_type::type actualLhs = LhsProductTraits::extract(lhs); @@ -118,7 +118,7 @@ struct triangular_solver_unroller { DiagIndex = IsLower ? LoopIndex : Size - LoopIndex - 1, StartIndex = IsLower ? 0 : DiagIndex+1 }; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { if (LoopIndex>0) rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment(StartIndex).transpose() @@ -133,22 +133,22 @@ struct triangular_solver_unroller { template struct triangular_solver_unroller { - static void run(const Lhs&, Rhs&) {} + static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {} }; template struct triangular_solver_selector { - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { triangular_solver_unroller::run(lhs,rhs); } }; template struct triangular_solver_selector { - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { Transpose trLhs(lhs); Transpose trRhs(rhs); - + triangular_solver_unroller,Transpose, ((Mode&Upper)==Upper ? 
Lower : Upper) | (Mode&UnitDiag), 0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs); @@ -164,11 +164,11 @@ struct triangular_solver_selector { #ifndef EIGEN_PARSED_BY_DOXYGEN template template -void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const +EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); - eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower))); + eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower)))); // If solving for a 0x0 matrix, nothing to do, simply return. if (derived().cols() == 0) return; @@ -213,8 +213,8 @@ template struct triangular_solv : m_triangularMatrix(tri), m_rhs(rhs) {} - inline Index rows() const { return m_rhs.rows(); } - inline Index cols() const { return m_rhs.cols(); } + inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); } + inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } template inline void evalTo(Dest& dst) const { diff --git a/inst/include/Eigen/src/Core/SolverBase.h b/inst/include/Eigen/src/Core/SolverBase.h index 8a4adc22..50146104 100644 --- a/inst/include/Eigen/src/Core/SolverBase.h +++ b/inst/include/Eigen/src/Core/SolverBase.h @@ -14,8 +14,35 @@ namespace Eigen { namespace internal { +template +struct solve_assertion { + template + static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion(b); } +}; + +template +struct solve_assertion > +{ + typedef Transpose type; + + template + static void run(const type& transpose, const Rhs& b) + { + internal::solve_assertion::type>::template run(transpose.nestedExpression(), b); + } +}; +template +struct solve_assertion, const Transpose > > +{ + typedef CwiseUnaryOp, const 
Transpose > type; + template + static void run(const type& adjoint, const Rhs& b) + { + internal::solve_assertion >::type>::template run(adjoint.nestedExpression(), b); + } +}; } // end namespace internal /** \class SolverBase @@ -35,7 +62,7 @@ namespace internal { * * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors. * - * \sa class PartialPivLU, class FullPivLU + * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase */ template class SolverBase : public EigenBase @@ -46,6 +73,9 @@ class SolverBase : public EigenBase typedef typename internal::traits::Scalar Scalar; typedef Scalar CoeffReturnType; + template + friend struct internal::solve_assertion; + enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, @@ -56,7 +86,8 @@ class SolverBase : public EigenBase MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime>::ret), IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1 + || internal::traits::MaxColsAtCompileTime == 1, + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 
1 : 2 }; /** Default constructor */ @@ -74,7 +105,7 @@ class SolverBase : public EigenBase inline const Solve solve(const MatrixBase& b) const { - eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b"); + internal::solve_assertion::type>::template run(derived(), b); return Solve(derived(), b.derived()); } @@ -112,6 +143,13 @@ class SolverBase : public EigenBase } protected: + + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(derived().m_isInitialized && "Solver is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b"); + } }; namespace internal { diff --git a/inst/include/Eigen/src/Core/StableNorm.h b/inst/include/Eigen/src/Core/StableNorm.h index 88c8d989..4a3f0cca 100644 --- a/inst/include/Eigen/src/Core/StableNorm.h +++ b/inst/include/Eigen/src/Core/StableNorm.h @@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc ssq += (bl*invScale).squaredNorm(); } +template +void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) +{ + typedef typename VectorType::Scalar Scalar; + const Index blockSize = 4096; + + typedef typename internal::nested_eval::type VectorTypeCopy; + typedef typename internal::remove_all::type VectorTypeCopyClean; + const VectorTypeCopy copy(vec); + + enum { + CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization + }; + typedef typename internal::conditional, internal::evaluator::Alignment>, + typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper; + Index n = vec.size(); + + Index bi 
= internal::first_default_aligned(copy); + if (bi>0) + internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); + for (; bi +typename VectorType::RealScalar +stable_norm_impl(const VectorType &vec, typename enable_if::type* = 0 ) +{ + using std::sqrt; + using std::abs; + + Index n = vec.size(); + + if(n==1) + return abs(vec.coeff(0)); + + typedef typename VectorType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + stable_norm_impl_inner_step(vec, ssq, scale, invScale); + + return scale * sqrt(ssq); +} + +template +typename MatrixType::RealScalar +stable_norm_impl(const MatrixType &mat, typename enable_if::type* = 0 ) +{ + using std::sqrt; + + typedef typename MatrixType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + for(Index j=0; j inline typename NumTraits::Scalar>::Real blueNorm_impl(const EigenBase& _vec) @@ -58,52 +123,43 @@ blueNorm_impl(const EigenBase& _vec) using std::pow; using std::sqrt; using std::abs; + + // This program calculates the machine-dependent constants + // bl, b2, slm, s2m, relerr overfl + // from the "basic" machine-dependent numbers + // nbig, ibeta, it, iemin, iemax, rbig. + // The following define the basic machine-dependent constants. + // For portability, the PORT subprograms "ilmaeh" and "rlmach" + // are used. 
For any specific computer, each of the assignment + // statements can be replaced + static const int ibeta = std::numeric_limits::radix; // base for floating-point numbers + static const int it = NumTraits::digits(); // number of base-beta digits in mantissa + static const int iemin = NumTraits::min_exponent(); // minimum exponent + static const int iemax = NumTraits::max_exponent(); // maximum exponent + static const RealScalar rbig = NumTraits::highest(); // largest floating-point number + static const RealScalar b1 = RealScalar(pow(RealScalar(ibeta),RealScalar(-((1-iemin)/2)))); // lower boundary of midrange + static const RealScalar b2 = RealScalar(pow(RealScalar(ibeta),RealScalar((iemax + 1 - it)/2))); // upper boundary of midrange + static const RealScalar s1m = RealScalar(pow(RealScalar(ibeta),RealScalar((2-iemin)/2))); // scaling factor for lower range + static const RealScalar s2m = RealScalar(pow(RealScalar(ibeta),RealScalar(- ((iemax+it)/2)))); // scaling factor for upper range + static const RealScalar eps = RealScalar(pow(double(ibeta), 1-it)); + static const RealScalar relerr = sqrt(eps); // tolerance for neglecting asml + const Derived& vec(_vec.derived()); - static bool initialized = false; - static RealScalar b1, b2, s1m, s2m, rbig, relerr; - if(!initialized) - { - int ibeta, it, iemin, iemax, iexp; - RealScalar eps; - // This program calculates the machine-dependent constants - // bl, b2, slm, s2m, relerr overfl - // from the "basic" machine-dependent numbers - // nbig, ibeta, it, iemin, iemax, rbig. - // The following define the basic machine-dependent constants. - // For portability, the PORT subprograms "ilmaeh" and "rlmach" - // are used. 
For any specific computer, each of the assignment - // statements can be replaced - ibeta = std::numeric_limits::radix; // base for floating-point numbers - it = std::numeric_limits::digits; // number of base-beta digits in mantissa - iemin = std::numeric_limits::min_exponent; // minimum exponent - iemax = std::numeric_limits::max_exponent; // maximum exponent - rbig = (std::numeric_limits::max)(); // largest floating-point number - - iexp = -((1-iemin)/2); - b1 = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // lower boundary of midrange - iexp = (iemax + 1 - it)/2; - b2 = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // upper boundary of midrange - - iexp = (2-iemin)/2; - s1m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for lower range - iexp = - ((iemax+it)/2); - s2m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for upper range - - eps = RealScalar(pow(double(ibeta), 1-it)); - relerr = sqrt(eps); // tolerance for neglecting asml - initialized = true; - } Index n = vec.size(); RealScalar ab2 = b2 / RealScalar(n); RealScalar asml = RealScalar(0); RealScalar amed = RealScalar(0); RealScalar abig = RealScalar(0); - for(typename Derived::InnerIterator it(vec, 0); it; ++it) + + for(Index j=0; j ab2) abig += numext::abs2(ax*s2m); - else if(ax < b1) asml += numext::abs2(ax*s1m); - else amed += numext::abs2(ax); + for(typename Derived::InnerIterator iter(vec, j); iter; ++iter) + { + RealScalar ax = abs(iter.value()); + if(ax > ab2) abig += numext::abs2(ax*s2m); + else if(ax < b1) asml += numext::abs2(ax*s1m); + else amed += numext::abs2(ax); + } } if(amed!=amed) return amed; // we got a NaN @@ -156,36 +212,7 @@ template inline typename NumTraits::Scalar>::Real MatrixBase::stableNorm() const { - using std::sqrt; - using std::abs; - const Index blockSize = 4096; - RealScalar scale(0); - RealScalar invScale(1); - RealScalar ssq(0); // sum of square - - typedef typename internal::nested_eval::type 
DerivedCopy; - typedef typename internal::remove_all::type DerivedCopyClean; - const DerivedCopy copy(derived()); - - enum { - CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) - || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization - }; - typedef typename internal::conditional, internal::evaluator::Alignment>, - typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; - Index n = size(); - - if(n==1) - return abs(this->coeff(0)); - - Index bi = internal::first_default_aligned(copy); - if (bi>0) - internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); - for (; bi inline typename NumTraits::Scalar>::Real MatrixBase::hypotNorm() const { - return this->cwiseAbs().redux(internal::scalar_hypot_op()); + if(size()==1) + return numext::abs(coeff(0,0)); + else + return this->cwiseAbs().redux(internal::scalar_hypot_op()); } } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/StlIterators.h b/inst/include/Eigen/src/Core/StlIterators.h new file mode 100644 index 00000000..09041db1 --- /dev/null +++ b/inst/include/Eigen/src/Core/StlIterators.h @@ -0,0 +1,463 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_STLITERATORS_H +#define EIGEN_STLITERATORS_H + +namespace Eigen { + +namespace internal { + +template +struct indexed_based_stl_iterator_traits; + +template +class indexed_based_stl_iterator_base +{ +protected: + typedef indexed_based_stl_iterator_traits traits; + typedef typename traits::XprType XprType; + typedef indexed_based_stl_iterator_base non_const_iterator; + typedef indexed_based_stl_iterator_base const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class indexed_based_stl_iterator_base; + friend class indexed_based_stl_iterator_base; +public: + typedef Index difference_type; + typedef std::random_access_iterator_tag iterator_category; + + indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {} + indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {} + + indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW + : mp_xpr(other.mp_xpr), m_index(other.m_index) + {} + + indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) + { + mp_xpr = other.mp_xpr; + m_index = other.m_index; + return *this; + } + + Derived& operator++() { ++m_index; return derived(); } + Derived& operator--() { --m_index; return derived(); } + + Derived operator++(int) { Derived prev(derived()); operator++(); return prev;} + Derived operator--(int) { Derived prev(derived()); operator--(); return prev;} + + friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; } + friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; } + friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret += a; 
return ret; } + friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; } + + Derived& operator+=(Index b) { m_index += b; return derived(); } + Derived& operator-=(Index b) { m_index -= b; return derived(); } + + difference_type operator-(const indexed_based_stl_iterator_base& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return m_index - other.m_index; + } + + difference_type operator-(const other_iterator& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return m_index - other.m_index; + } + + bool operator==(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator<=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + bool operator> (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator>=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + + bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + bool operator> (const 
other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + +protected: + + Derived& derived() { return static_cast(*this); } + const Derived& derived() const { return static_cast(*this); } + + XprType *mp_xpr; + Index m_index; +}; + +template +class indexed_based_stl_reverse_iterator_base +{ +protected: + typedef indexed_based_stl_iterator_traits traits; + typedef typename traits::XprType XprType; + typedef indexed_based_stl_reverse_iterator_base non_const_iterator; + typedef indexed_based_stl_reverse_iterator_base const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class indexed_based_stl_reverse_iterator_base; + friend class indexed_based_stl_reverse_iterator_base; +public: + typedef Index difference_type; + typedef std::random_access_iterator_tag iterator_category; + + indexed_based_stl_reverse_iterator_base() : mp_xpr(0), m_index(0) {} + indexed_based_stl_reverse_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {} + + indexed_based_stl_reverse_iterator_base(const non_const_iterator& other) + : mp_xpr(other.mp_xpr), m_index(other.m_index) + {} + + indexed_based_stl_reverse_iterator_base& operator=(const non_const_iterator& other) + { + mp_xpr = other.mp_xpr; + m_index = other.m_index; + return *this; + } + + Derived& operator++() { --m_index; return derived(); } + Derived& operator--() { ++m_index; return derived(); } + + Derived operator++(int) { Derived prev(derived()); operator++(); return prev;} + Derived operator--(int) { Derived prev(derived()); operator--(); return prev;} + + friend Derived operator+(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived 
ret(a.derived()); ret += b; return ret; } + friend Derived operator-(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; } + friend Derived operator+(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; } + friend Derived operator-(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; } + + Derived& operator+=(Index b) { m_index -= b; return derived(); } + Derived& operator-=(Index b) { m_index += b; return derived(); } + + difference_type operator-(const indexed_based_stl_reverse_iterator_base& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return other.m_index - m_index; + } + + difference_type operator-(const other_iterator& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return other.m_index - m_index; + } + + bool operator==(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator<=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + bool operator> (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator>=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + + bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const other_iterator& 
other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + +protected: + + Derived& derived() { return static_cast(*this); } + const Derived& derived() const { return static_cast(*this); } + + XprType *mp_xpr; + Index m_index; +}; + +template +class pointer_based_stl_iterator +{ + enum { is_lvalue = internal::is_lvalue::value }; + typedef pointer_based_stl_iterator::type> non_const_iterator; + typedef pointer_based_stl_iterator::type> const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class pointer_based_stl_iterator::type>; + friend class pointer_based_stl_iterator::type>; +public: + typedef Index difference_type; + typedef typename XprType::Scalar value_type; + typedef std::random_access_iterator_tag iterator_category; + typedef typename internal::conditional::type pointer; + typedef typename internal::conditional::type reference; + + + pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {} + pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) + { + m_ptr = xpr.data() + index * m_incr.value(); + } + + pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW + : m_ptr(other.m_ptr), m_incr(other.m_incr) + {} + + pointer_based_stl_iterator& operator=(const non_const_iterator& other) 
EIGEN_NO_THROW + { + m_ptr = other.m_ptr; + m_incr.setValue(other.m_incr); + return *this; + } + + reference operator*() const { return *m_ptr; } + reference operator[](Index i) const { return *(m_ptr+i*m_incr.value()); } + pointer operator->() const { return m_ptr; } + + pointer_based_stl_iterator& operator++() { m_ptr += m_incr.value(); return *this; } + pointer_based_stl_iterator& operator--() { m_ptr -= m_incr.value(); return *this; } + + pointer_based_stl_iterator operator++(int) { pointer_based_stl_iterator prev(*this); operator++(); return prev;} + pointer_based_stl_iterator operator--(int) { pointer_based_stl_iterator prev(*this); operator--(); return prev;} + + friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret += b; return ret; } + friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret -= b; return ret; } + friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret += a; return ret; } + friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret -= a; return ret; } + + pointer_based_stl_iterator& operator+=(Index b) { m_ptr += b*m_incr.value(); return *this; } + pointer_based_stl_iterator& operator-=(Index b) { m_ptr -= b*m_incr.value(); return *this; } + + difference_type operator-(const pointer_based_stl_iterator& other) const { + return (m_ptr - other.m_ptr)/m_incr.value(); + } + + difference_type operator-(const other_iterator& other) const { + return (m_ptr - other.m_ptr)/m_incr.value(); + } + + bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; } + bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; } + bool operator< (const pointer_based_stl_iterator& other) const { 
return m_ptr < other.m_ptr; } + bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; } + bool operator> (const pointer_based_stl_iterator& other) const { return m_ptr > other.m_ptr; } + bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; } + + bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; } + bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; } + bool operator< (const other_iterator& other) const { return m_ptr < other.m_ptr; } + bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; } + bool operator> (const other_iterator& other) const { return m_ptr > other.m_ptr; } + bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; } + +protected: + + pointer m_ptr; + internal::variable_if_dynamic m_incr; +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef generic_randaccess_stl_iterator::type> non_const_iterator; + typedef generic_randaccess_stl_iterator::type> const_iterator; +}; + +template +class generic_randaccess_stl_iterator : public indexed_based_stl_iterator_base > +{ +public: + typedef typename XprType::Scalar value_type; + +protected: + + enum { + has_direct_access = (internal::traits::Flags & DirectAccessBit) ? 1 : 0, + is_lvalue = internal::is_lvalue::value + }; + + typedef indexed_based_stl_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + // TODO currently const Transpose/Reshape expressions never returns const references, + // so lets return by value too. 
+ //typedef typename internal::conditional::type read_only_ref_t; + typedef const value_type read_only_ref_t; + +public: + + typedef typename internal::conditional::type pointer; + typedef typename internal::conditional::type reference; + + generic_randaccess_stl_iterator() : Base() {} + generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {} + using Base::operator=; + + reference operator*() const { return (*mp_xpr)(m_index); } + reference operator[](Index i) const { return (*mp_xpr)(m_index+i); } + pointer operator->() const { return &((*mp_xpr)(m_index)); } +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef subvector_stl_iterator::type, Direction> non_const_iterator; + typedef subvector_stl_iterator::type, Direction> const_iterator; +}; + +template +class subvector_stl_iterator : public indexed_based_stl_iterator_base > +{ +protected: + + enum { is_lvalue = internal::is_lvalue::value }; + + typedef indexed_based_stl_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + typedef typename internal::conditional::type SubVectorType; + typedef typename internal::conditional::type ConstSubVectorType; + + +public: + typedef typename internal::conditional::type reference; + typedef typename reference::PlainObject value_type; + +private: + class subvector_stl_iterator_ptr + { + public: + subvector_stl_iterator_ptr(const reference &subvector) : m_subvector(subvector) {} + reference* operator->() { return &m_subvector; } + private: + reference m_subvector; + }; +public: + + typedef subvector_stl_iterator_ptr pointer; + + subvector_stl_iterator() : Base() {} + subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + + reference operator*() const { return (*mp_xpr).template subVector(m_index); } + reference operator[](Index i) const { return (*mp_xpr).template 
subVector(m_index+i); } + pointer operator->() const { return (*mp_xpr).template subVector(m_index); } +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef subvector_stl_reverse_iterator::type, Direction> non_const_iterator; + typedef subvector_stl_reverse_iterator::type, Direction> const_iterator; +}; + +template +class subvector_stl_reverse_iterator : public indexed_based_stl_reverse_iterator_base > +{ +protected: + + enum { is_lvalue = internal::is_lvalue::value }; + + typedef indexed_based_stl_reverse_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + typedef typename internal::conditional::type SubVectorType; + typedef typename internal::conditional::type ConstSubVectorType; + + +public: + typedef typename internal::conditional::type reference; + typedef typename reference::PlainObject value_type; + +private: + class subvector_stl_reverse_iterator_ptr + { + public: + subvector_stl_reverse_iterator_ptr(const reference &subvector) : m_subvector(subvector) {} + reference* operator->() { return &m_subvector; } + private: + reference m_subvector; + }; +public: + + typedef subvector_stl_reverse_iterator_ptr pointer; + + subvector_stl_reverse_iterator() : Base() {} + subvector_stl_reverse_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + + reference operator*() const { return (*mp_xpr).template subVector(m_index); } + reference operator[](Index i) const { return (*mp_xpr).template subVector(m_index+i); } + pointer operator->() const { return (*mp_xpr).template subVector(m_index); } +}; + +} // namespace internal + + +/** returns an iterator to the first element of the 1D vector or array + * \only_for_vectors + * \sa end(), cbegin() + */ +template +inline typename DenseBase::iterator DenseBase::begin() +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return iterator(derived(), 0); +} + +/** const version of begin() */ +template +inline typename DenseBase::const_iterator DenseBase::begin() const 
+{ + return cbegin(); +} + +/** returns a read-only const_iterator to the first element of the 1D vector or array + * \only_for_vectors + * \sa cend(), begin() + */ +template +inline typename DenseBase::const_iterator DenseBase::cbegin() const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return const_iterator(derived(), 0); +} + +/** returns an iterator to the element following the last element of the 1D vector or array + * \only_for_vectors + * \sa begin(), cend() + */ +template +inline typename DenseBase::iterator DenseBase::end() +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return iterator(derived(), size()); +} + +/** const version of end() */ +template +inline typename DenseBase::const_iterator DenseBase::end() const +{ + return cend(); +} + +/** returns a read-only const_iterator to the element following the last element of the 1D vector or array + * \only_for_vectors + * \sa begin(), cend() + */ +template +inline typename DenseBase::const_iterator DenseBase::cend() const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return const_iterator(derived(), size()); +} + +} // namespace Eigen + +#endif // EIGEN_STLITERATORS_H diff --git a/inst/include/Eigen/src/Core/Stride.h b/inst/include/Eigen/src/Core/Stride.h index 513742f3..6494d514 100644 --- a/inst/include/Eigen/src/Core/Stride.h +++ b/inst/include/Eigen/src/Core/Stride.h @@ -10,7 +10,7 @@ #ifndef EIGEN_STRIDE_H #define EIGEN_STRIDE_H -namespace Eigen { +namespace Eigen { /** \class Stride * \ingroup Core_Module @@ -38,6 +38,10 @@ namespace Eigen { * \include Map_general_stride.cpp * Output: \verbinclude Map_general_stride.out * + * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime + * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were + * not allowed). 
+ * * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders */ template @@ -55,6 +59,8 @@ class Stride Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) { + // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic. + // FIXME: for Eigen 4 we should also unify this API with fix<> eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic); } @@ -63,7 +69,6 @@ class Stride Stride(Index outerStride, Index innerStride) : m_outer(outerStride), m_inner(innerStride) { - eigen_assert(innerStride>=0 && outerStride>=0); } /** Copy constructor */ @@ -73,10 +78,10 @@ class Stride {} /** \returns the outer stride */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outer() const { return m_outer.value(); } /** \returns the inner stride */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index inner() const { return m_inner.value(); } protected: diff --git a/inst/include/Eigen/src/Core/Swap.h b/inst/include/Eigen/src/Core/Swap.h index d7020091..180a4e5a 100644 --- a/inst/include/Eigen/src/Core/Swap.h +++ b/inst/include/Eigen/src/Core/Swap.h @@ -30,12 +30,13 @@ class generic_dense_assignment_kernel Functor; - EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} template - void assignPacket(Index row, Index col) + EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) { PacketType tmp = m_src.template packet(row,col); const_cast(m_src).template writePacket(row,col, m_dst.template packet(row,col)); @@ -43,7 +44,7 @@ class generic_dense_assignment_kernel - void assignPacket(Index index) + EIGEN_STRONG_INLINE void assignPacket(Index index) { PacketType tmp = m_src.template 
packet(index); const_cast(m_src).template writePacket(index, m_dst.template packet(index)); @@ -52,7 +53,7 @@ class generic_dense_assignment_kernel - void assignPacketByOuterInner(Index outer, Index inner) + EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { Index row = Base::rowIndexByOuterInner(outer, inner); Index col = Base::colIndexByOuterInner(outer, inner); diff --git a/inst/include/Eigen/src/Core/Transpose.h b/inst/include/Eigen/src/Core/Transpose.h index 960dc451..2bc658f4 100644 --- a/inst/include/Eigen/src/Core/Transpose.h +++ b/inst/include/Eigen/src/Core/Transpose.h @@ -11,7 +11,7 @@ #ifndef EIGEN_TRANSPOSE_H #define EIGEN_TRANSPOSE_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -61,24 +61,27 @@ template class Transpose typedef typename internal::remove_all::type NestedExpression; EIGEN_DEVICE_FUNC - explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {} + explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {} EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose) - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::remove_reference::type& nestedExpression() { return m_matrix; } /** \internal */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index nrows, Index ncols) { m_matrix.resize(ncols,nrows); } @@ -122,8 +125,10 @@ template class 
TransposeImpl EIGEN_DENSE_PUBLIC_INTERFACE(Transpose) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl) - EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index innerStride() const { return derived().nestedExpression().innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return derived().nestedExpression().outerStride(); } typedef typename internal::conditional< internal::is_lvalue::value, @@ -131,18 +136,20 @@ template class TransposeImpl const Scalar >::type ScalarWithConstIfNotLvalue; - EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } - EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar* data() const { return derived().nestedExpression().data(); } // FIXME: shall we keep the const version of coeffRef? 
- EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index rowId, Index colId) const { return derived().nestedExpression().coeffRef(colId, rowId); } - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index index) const { return derived().nestedExpression().coeffRef(index); } @@ -170,7 +177,8 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -inline Transpose +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -182,7 +190,8 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -inline typename DenseBase::ConstTransposeReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); @@ -208,7 +217,7 @@ DenseBase::transpose() const * * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ template -inline const typename MatrixBase::AdjointReturnType +EIGEN_DEVICE_FUNC inline const typename MatrixBase::AdjointReturnType MatrixBase::adjoint() const { return AdjointReturnType(this->transpose()); @@ -230,11 +239,10 @@ struct inplace_transpose_selector; template struct inplace_transpose_selector { // square matrix static void run(MatrixType& m) { - m.matrix().template triangularView().swap(m.matrix().transpose()); + m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); } }; -// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. 
template struct inplace_transpose_selector { // PacketSize x PacketSize static void run(MatrixType& m) { @@ -251,16 +259,66 @@ struct inplace_transpose_selector { // PacketSize x Packet } }; + +template +void BlockedInPlaceTranspose(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + const Index PacketSize = internal::packet_traits::size; + eigen_assert(m.rows() == m.cols()); + int row_start = 0; + for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) { + for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) { + PacketBlock A; + if (row_start == col_start) { + for (Index i=0; i(row_start + i,col_start); + internal::ptranspose(A); + for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]); + } else { + PacketBlock B; + for (Index i=0; i(row_start + i,col_start); + B.packet[i] = m.template packetByOuterInner(col_start + i, row_start); + } + internal::ptranspose(A); + internal::ptranspose(B); + for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]); + m.template writePacket(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]); + } + } + } + } + for (Index row = row_start; row < m.rows(); ++row) { + m.matrix().row(row).head(row).swap( + m.matrix().col(row).head(row).transpose()); + } +} + template -struct inplace_transpose_selector { // non square matrix +struct inplace_transpose_selector { // non square or dynamic matrix static void run(MatrixType& m) { - if (m.rows()==m.cols()) - m.matrix().template triangularView().swap(m.matrix().transpose()); - else + typedef typename MatrixType::Scalar Scalar; + if (m.rows() == m.cols()) { + const Index PacketSize = internal::packet_traits::size; + if (!NumTraits::IsComplex && m.rows() >= PacketSize) { + 
if ((m.rows() % PacketSize) == 0) + BlockedInPlaceTranspose::Alignment>(m); + else + BlockedInPlaceTranspose(m); + } + else { + m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); + } + } else { m = m.transpose().eval(); + } } }; + } // end namespace internal /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose. @@ -278,12 +336,12 @@ struct inplace_transpose_selector { // non squ * Notice however that this method is only useful if you want to replace a matrix by its own transpose. * If you just need the transpose of a matrix, use transpose(). * - * \note if the matrix is not square, then \c *this must be a resizable matrix. + * \note if the matrix is not square, then \c *this must be a resizable matrix. * This excludes (non-square) fixed-size matrices, block-expressions and maps. * * \sa transpose(), adjoint(), adjointInPlace() */ template -inline void DenseBase::transposeInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::transposeInPlace() { eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) && "transposeInPlace() called on a non-square non-resizable matrix"); @@ -314,7 +372,7 @@ inline void DenseBase::transposeInPlace() * * \sa transpose(), adjoint(), transposeInPlace() */ template -inline void MatrixBase::adjointInPlace() +EIGEN_DEVICE_FUNC inline void MatrixBase::adjointInPlace() { derived() = adjoint().eval(); } @@ -393,7 +451,8 @@ struct checkTransposeAliasing_impl template void check_for_aliasing(const Dst &dst, const Src &src) { - internal::checkTransposeAliasing_impl::run(dst, src); + if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1) + internal::checkTransposeAliasing_impl::run(dst, src); } } // end namespace internal diff --git a/inst/include/Eigen/src/Core/Transpositions.h b/inst/include/Eigen/src/Core/Transpositions.h index 7718625e..38a7b01c 100644 --- a/inst/include/Eigen/src/Core/Transpositions.h +++ 
b/inst/include/Eigen/src/Core/Transpositions.h @@ -10,20 +10,22 @@ #ifndef EIGEN_TRANSPOSITIONS_H #define EIGEN_TRANSPOSITIONS_H -namespace Eigen { +namespace Eigen { template class TranspositionsBase { typedef internal::traits Traits; - + public: typedef typename Traits::IndicesType IndicesType; typedef typename IndicesType::Scalar StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 + EIGEN_DEVICE_FUNC Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC const Derived& derived() const { return *static_cast(this); } /** Copies the \a other transpositions into \c *this */ @@ -35,13 +37,17 @@ class TranspositionsBase } /** \returns the number of transpositions */ + EIGEN_DEVICE_FUNC Index size() const { return indices().size(); } /** \returns the number of rows of the equivalent permutation matrix */ + EIGEN_DEVICE_FUNC Index rows() const { return indices().size(); } /** \returns the number of columns of the equivalent permutation matrix */ + EIGEN_DEVICE_FUNC Index cols() const { return indices().size(); } /** Direct access to the underlying index vector */ + EIGEN_DEVICE_FUNC inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); } /** Direct access to the underlying index vector */ inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); } @@ -55,8 +61,10 @@ class TranspositionsBase inline StorageIndex& operator[](Index i) { return indices()(i); } /** const version of indices(). */ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return derived().indices(); } /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return derived().indices(); } /** Resizes to given size. */ @@ -73,7 +81,7 @@ class TranspositionsBase } // FIXME: do we want such methods ? 
- // might be usefull when the target matrix expression is complex, e.g.: + // might be useful when the target matrix expression is complex, e.g.: // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..); /* template @@ -178,8 +186,10 @@ class Transpositions : public TranspositionsBase,P #endif /** const version of indices(). */ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; } - + /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; } protected: @@ -279,9 +291,11 @@ class TranspositionsWrapper } /** const version of indices(). */ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; } /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; } protected: @@ -335,9 +349,12 @@ class Transpose > explicit Transpose(const TranspositionType& t) : m_transpositions(t) {} - Index size() const { return m_transpositions.size(); } - Index rows() const { return m_transpositions.size(); } - Index cols() const { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); } /** \returns the \a matrix with the inverse transpositions applied to the columns. 
*/ @@ -356,7 +373,8 @@ class Transpose > { return Product(*this, matrix.derived()); } - + + EIGEN_DEVICE_FUNC const TranspositionType& nestedExpression() const { return m_transpositions; } protected: diff --git a/inst/include/Eigen/src/Core/TriangularMatrix.h b/inst/include/Eigen/src/Core/TriangularMatrix.h index 9abb7e31..fdb8bc15 100644 --- a/inst/include/Eigen/src/Core/TriangularMatrix.h +++ b/inst/include/Eigen/src/Core/TriangularMatrix.h @@ -11,12 +11,12 @@ #ifndef EIGEN_TRIANGULARMATRIX_H #define EIGEN_TRIANGULARMATRIX_H -namespace Eigen { +namespace Eigen { namespace internal { - + template struct triangular_solve_retval; - + } /** \class TriangularBase @@ -34,16 +34,16 @@ template class TriangularBase : public EigenBase ColsAtCompileTime = internal::traits::ColsAtCompileTime, MaxRowsAtCompileTime = internal::traits::MaxRowsAtCompileTime, MaxColsAtCompileTime = internal::traits::MaxColsAtCompileTime, - + SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, internal::traits::ColsAtCompileTime>::ret), /**< This is equal to the number of coefficients, i.e. the number of * rows times the number of columns, or to \a Dynamic if this is not * known at compile-time. 
\sa RowsAtCompileTime, ColsAtCompileTime */ - + MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime>::ret) - + }; typedef typename internal::traits::Scalar Scalar; typedef typename internal::traits::StorageKind StorageKind; @@ -53,18 +53,19 @@ template class TriangularBase : public EigenBase typedef Derived const& Nested; EIGEN_DEVICE_FUNC - inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); } + inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); } - EIGEN_DEVICE_FUNC - inline Index rows() const { return derived().rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return derived().cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return derived().outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return derived().innerStride(); } - // dummy resize function + EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { EIGEN_UNUSED_VARIABLE(rows); @@ -155,7 +156,7 @@ template class TriangularBase : public EigenBase * \param MatrixType the type of the object in which we are taking the triangular part * \param Mode the kind of triangular matrix expression to construct. Can be #Upper, * #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower. 
- * This is in fact a bit field; it must have either #Upper or #Lower, + * This is in fact a bit field; it must have either #Upper or #Lower, * and additionally it may have #UnitDiag or #ZeroDiag or neither. * * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular @@ -197,7 +198,8 @@ template class TriangularView typedef typename internal::traits::MatrixTypeNestedNonRef MatrixTypeNestedNonRef; typedef typename internal::remove_all::type MatrixConjugateReturnType; - + typedef TriangularView::type, _Mode> ConstTriangularView; + public: typedef typename internal::traits::StorageKind StorageKind; @@ -216,15 +218,15 @@ template class TriangularView EIGEN_DEVICE_FUNC explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {} - + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView) /** \copydoc EigenBase::rows() */ - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } /** \copydoc EigenBase::cols() */ - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \returns a const reference to the nested expression */ EIGEN_DEVICE_FUNC @@ -233,13 +235,25 @@ template class TriangularView /** \returns a reference to the nested expression */ EIGEN_DEVICE_FUNC NestedExpression& nestedExpression() { return m_matrix; } - + typedef TriangularView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. 
+ */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_matrix.template conjugateIf()); + } + typedef TriangularView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC @@ -255,7 +269,7 @@ template class TriangularView typename MatrixType::TransposeReturnType tmp(m_matrix); return TransposeReturnType(tmp); } - + typedef TriangularView ConstTransposeReturnType; /** \sa MatrixBase::transpose() const */ EIGEN_DEVICE_FUNC @@ -266,10 +280,10 @@ template class TriangularView template EIGEN_DEVICE_FUNC - inline const Solve + inline const Solve solve(const MatrixBase& other) const { return Solve(*this, other.derived()); } - + // workaround MSVC ICE #if EIGEN_COMP_MSVC template @@ -313,7 +327,7 @@ template class TriangularView else return m_matrix.diagonal().prod(); } - + protected: MatrixTypeNested m_matrix; @@ -375,7 +389,7 @@ template class TriangularViewImpl<_Mat internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op()); return derived(); } - + /** \sa MatrixBase::operator*=() */ EIGEN_DEVICE_FUNC TriangularViewType& operator*=(const typename internal::traits::Scalar& other) { return *this = derived().nestedExpression() * other; } @@ -433,14 +447,14 @@ template class TriangularViewImpl<_Mat TriangularViewType& operator=(const TriangularViewImpl& other) { return *this = other.derived().nestedExpression(); } - /** \deprecated */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const TriangularBase& other); - /** \deprecated */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const MatrixBase& other); #endif @@ -468,7 +482,7 @@ template class TriangularViewImpl<_Mat * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if * \a 
Side==OnTheRight. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this @@ -486,7 +500,6 @@ template class TriangularViewImpl<_Mat * \sa TriangularView::solveInPlace() */ template - EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval solve(const MatrixBase& other) const; @@ -495,7 +508,7 @@ template class TriangularViewImpl<_Mat * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. * This function will const_cast it, so constness isn't honored here. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * See TriangularView:solve() for the details. 
*/ @@ -521,10 +534,10 @@ template class TriangularViewImpl<_Mat call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op()); } - /** \deprecated - * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */ + /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void swap(MatrixBase const & other) { EIGEN_STATIC_ASSERT_LVALUE(OtherDerived); @@ -556,7 +569,7 @@ template class TriangularViewImpl<_Mat // FIXME should we keep that possibility template template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op()); @@ -566,7 +579,7 @@ TriangularViewImpl::operator=(const MatrixBase template -void TriangularViewImpl::lazyAssign(const MatrixBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.template triangularView()); } @@ -575,7 +588,7 @@ void TriangularViewImpl::lazyAssign(const MatrixBase template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); @@ -585,7 +598,7 @@ TriangularViewImpl::operator=(const TriangularBase template -void TriangularViewImpl::lazyAssign(const TriangularBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); internal::call_assignment_no_alias(derived(), other.derived()); @@ -600,7 +613,7 @@ void TriangularViewImpl::lazyAssign(const TriangularBas * If the matrix is triangular, the opposite part is set to zero. 
*/ template template -void TriangularBase::evalTo(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase &other) const { evalToLazy(other.derived()); } @@ -626,6 +639,7 @@ void TriangularBase::evalTo(MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template TriangularViewReturnType::Type MatrixBase::triangularView() { @@ -635,6 +649,7 @@ MatrixBase::triangularView() /** This is the const version of MatrixBase::triangularView() */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstTriangularViewReturnType::Type MatrixBase::triangularView() const { @@ -700,7 +715,7 @@ bool MatrixBase::isLowerTriangular(const RealScalar& prec) const namespace internal { - + // TODO currently a triangular expression has the form TriangularView<.,.> // in the future triangular-ness should be defined by the expression traits // such that Transpose > is valid. (currently TriangularBase::transpose() is overloaded to make it work) @@ -717,6 +732,7 @@ struct unary_evaluator, IndexBased> { typedef TriangularView XprType; typedef evaluator::type> Base; + EIGEN_DEVICE_FUNC unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {} }; @@ -728,7 +744,7 @@ struct Dense2Triangular {}; template struct triangular_assignment_loop; - + /** \internal Specialization of the dense assignment kernel for triangular matrices. * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions. 
* \tparam UpLo must be either Lower or Upper @@ -745,17 +761,17 @@ class triangular_dense_assignment_kernel : public generic_dense_assignment_kerne using Base::m_src; using Base::m_functor; public: - + typedef typename Base::DstEvaluatorType DstEvaluatorType; typedef typename Base::SrcEvaluatorType SrcEvaluatorType; typedef typename Base::Scalar Scalar; typedef typename Base::AssignmentTraits AssignmentTraits; - - + + EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} - + #ifdef EIGEN_INTERNAL_DEBUGGING EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col) { @@ -765,16 +781,16 @@ class triangular_dense_assignment_kernel : public generic_dense_assignment_kerne #else using Base::assignCoeff; #endif - + EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id) { if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1)); else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0)); else if(Mode==0) Base::assignCoeff(id,id); } - + EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col) - { + { eigen_internal_assert(row!=col); if(SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0)); @@ -795,17 +811,17 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) dst.resize(dstRows, dstCols); DstEvaluatorType dstEvaluator(dst); - + typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite, DstEvaluatorType,SrcEvaluatorType,Functor> Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); - + enum { unroll = DstXprType::SizeAtCompileTime != Dynamic && SrcEvaluatorType::CoeffReadCost < HugeCost - && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= 
EIGEN_UNROLLING_LIMIT + && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT }; - + triangular_assignment_loop::run(kernel); } @@ -827,8 +843,8 @@ struct Assignment EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode)); - - call_triangular_assignment_loop(dst, src, func); + + call_triangular_assignment_loop(dst, src, func); } }; @@ -837,7 +853,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -846,7 +862,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -857,19 +873,19 @@ struct triangular_assignment_loop // FIXME: this is not very clean, perhaps this information should be provided by the kernel? typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; - + enum { col = (UnrollCount-1) / DstXprType::RowsAtCompileTime, row = (UnrollCount-1) % DstXprType::RowsAtCompileTime }; - + typedef typename Kernel::Scalar Scalar; EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) { triangular_assignment_loop::run(kernel); - + if(row==col) kernel.assignDiagonalCoeff(row); else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row } else i = maxi; - + if(i * If the matrix is triangular, the opposite part is set to zero. 
*/ template template -void TriangularBase::evalToLazy(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); - internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); + internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); } namespace internal { - + // Triangular = Product template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> struct Assignment, internal::assign_op::Scalar>, Dense2Triangular> @@ -952,7 +968,7 @@ struct Assignment, internal::assign_ if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) dst.resize(dstRows, dstCols); - dst._assignProduct(src, 1, 0); + dst._assignProduct(src, Scalar(1), false); } }; @@ -963,7 +979,7 @@ struct Assignment, internal::add_ass typedef Product SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { - dst._assignProduct(src, 1, 1); + dst._assignProduct(src, Scalar(1), true); } }; @@ -974,7 +990,7 @@ struct Assignment, internal::sub_ass typedef Product SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { - dst._assignProduct(src, -1, 1); + dst._assignProduct(src, Scalar(-1), true); } }; diff --git a/inst/include/Eigen/src/Core/VectorBlock.h b/inst/include/Eigen/src/Core/VectorBlock.h index d72fbf7e..71c5b95e 100644 --- a/inst/include/Eigen/src/Core/VectorBlock.h +++ b/inst/include/Eigen/src/Core/VectorBlock.h @@ -35,7 +35,7 @@ struct traits > * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and * most of the time this is the only way it is used. * - * However, if you want to directly maniputate sub-vector expressions, + * However, if you want to directly manipulate sub-vector expressions, * for instance if you want to write a function returning such an expression, you * will need to use this class. 
* @@ -71,8 +71,8 @@ template class VectorBlock /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline VectorBlock(VectorType& vector, Index start, Index size) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + VectorBlock(VectorType& vector, Index start, Index size) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 1 : size) @@ -82,8 +82,8 @@ template class VectorBlock /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline VectorBlock(VectorType& vector, Index start) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + VectorBlock(VectorType& vector, Index start) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); diff --git a/inst/include/Eigen/src/Core/VectorwiseOp.h b/inst/include/Eigen/src/Core/VectorwiseOp.h index 4fe267e9..870f4f1e 100644 --- a/inst/include/Eigen/src/Core/VectorwiseOp.h +++ b/inst/include/Eigen/src/Core/VectorwiseOp.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2008-2010 Gael Guennebaud +// Copyright (C) 2008-2019 Gael Guennebaud // Copyright (C) 2006-2008 Benoit Jacob // // This Source Code Form is subject to the terms of the Mozilla @@ -65,10 +65,10 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr \ - struct member_##MEMBER { \ - EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ - typedef ResultType result_type; \ - template struct Cost \ - { enum { value = COST }; }; \ - template \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ - ResultType operator()(const XprType& mat) const \ - { return mat.MEMBER(); } \ +template struct partial_redux_dummy_func; + +#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP) \ + template \ + struct member_##MEMBER { \ + EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ + typedef ResultType result_type; \ + typedef BINARYOP BinaryOp; \ + template struct Cost { enum { value = COST }; }; \ + enum { Vectorizable = VECTORIZABLE }; \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ + ResultType operator()(const XprType& mat) const \ + { return mat.MEMBER(); } \ + BinaryOp binaryFunc() const { return BinaryOp(); } \ } +#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \ + EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func) + namespace internal { -EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits >::Cost ); -EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits::AddCost + NumTraits::MulCost); -EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(maxCoeff, 
(Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits::MulCost); -template +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits::AddCost, 1, internal::scalar_sum_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_min_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_max_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits::MulCost, 1, internal::scalar_product_op); + +template struct member_lpnorm { typedef ResultType result_type; - template struct Cost + enum { Vectorizable = 0 }; + template struct Cost { enum { value = (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost }; }; EIGEN_DEVICE_FUNC member_lpnorm() {} template @@ -121,17 +128,20 @@ struct member_lpnorm { { return mat.template lpNorm

(); } }; -template +template struct member_redux { + typedef BinaryOpT BinaryOp; typedef typename result_of< BinaryOp(const Scalar&,const Scalar&) >::type result_type; - template struct Cost - { enum { value = (Size-1) * functor_traits::Cost }; }; + + enum { Vectorizable = functor_traits::PacketAccess }; + template struct Cost { enum { value = (Size-1) * functor_traits::Cost }; }; EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {} template EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase& mat) const { return mat.redux(m_functor); } + const BinaryOp& binaryFunc() const { return m_functor; } const BinaryOp m_functor; }; } @@ -139,18 +149,38 @@ struct member_redux { /** \class VectorwiseOp * \ingroup Core_Module * - * \brief Pseudo expression providing partial reduction operations + * \brief Pseudo expression providing broadcasting and partial reduction operations * * \tparam ExpressionType the type of the object on which to do partial reductions - * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal) + * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal) * - * This class represents a pseudo expression with partial reduction features. + * This class represents a pseudo expression with broadcasting and partial reduction features. * It is the return type of DenseBase::colwise() and DenseBase::rowwise() - * and most of the time this is the only way it is used. + * and most of the time this is the only way it is explicitly used. + * + * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()` + * where `foo` is any method of `VectorwiseOp`. 
This expression is equivalent to applying `foo()` to each + * column of `A` and then re-assemble the outputs in a matrix expression: + * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode * * Example: \include MatrixBase_colwise.cpp * Output: \verbinclude MatrixBase_colwise.out * + * The begin() and end() methods are obviously exceptions to the previous rule as they + * return STL-compatible begin/end iterators to the rows or columns of the nested expression. + * Typical use cases include for-range-loop and calls to STL algorithms: + * + * Example: \include MatrixBase_colwise_iterator_cxx11.cpp + * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out + * + * For a partial reduction on an empty input, some rules apply. + * For the sake of clarity, let's consider a vertical reduction: + * - If the number of columns is zero, then a 1x0 row-major vector expression is returned. + * - Otherwise, if the number of rows is zero, then + * - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.) 
+ * - a row vector of ones is returned for a product reduction (e.g., MatrixXd(n,0).colwise().prod()) + * - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op)) + * * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr */ template class VectorwiseOp @@ -163,11 +193,11 @@ template class VectorwiseOp typedef typename internal::ref_selector::non_const_type ExpressionTypeNested; typedef typename internal::remove_all::type ExpressionTypeNestedCleaned; - template class Functor, - typename Scalar_=Scalar> struct ReturnType + template class Functor, + typename ReturnScalar=Scalar> struct ReturnType { typedef PartialReduxExpr, + Functor, Direction > Type; }; @@ -187,23 +217,6 @@ template class VectorwiseOp protected: - typedef typename internal::conditional::type SubVector; - /** \internal - * \returns the i-th subvector according to the \c Direction */ - EIGEN_DEVICE_FUNC - SubVector subVector(Index i) - { - return SubVector(m_matrix.derived(),i); - } - - /** \internal - * \returns the number of subvectors in the direction \c Direction */ - EIGEN_DEVICE_FUNC - Index subVectors() const - { return isVertical?m_matrix.cols():m_matrix.rows(); } - template struct ExtendedType { typedef Replicate class VectorwiseOp EIGEN_DEVICE_FUNC inline const ExpressionType& _expression() const { return m_matrix; } + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** STL-like RandomAccessIterator + * iterator type over the columns or rows as returned by the begin() and end() methods. 
+ */ + random_access_iterator_type iterator; + /** This is the const version of iterator (aka read-only) */ + random_access_iterator_type const_iterator; + #else + typedef internal::subvector_stl_iterator iterator; + typedef internal::subvector_stl_iterator const_iterator; + typedef internal::subvector_stl_reverse_iterator reverse_iterator; + typedef internal::subvector_stl_reverse_iterator const_reverse_iterator; + #endif + + /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression. + * \sa end(), cbegin() + */ + iterator begin() { return iterator (m_matrix, 0); } + /** const version of begin() */ + const_iterator begin() const { return const_iterator(m_matrix, 0); } + /** const version of begin() */ + const_iterator cbegin() const { return const_iterator(m_matrix, 0); } + + /** returns a reverse iterator to the last row (rowwise) or column (colwise) of the nested expression. + * \sa rend(), crbegin() + */ + reverse_iterator rbegin() { return reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + /** const version of rbegin() */ + const_reverse_iterator rbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + /** const version of rbegin() */ + const_reverse_iterator crbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + + /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression + * \sa begin(), cend() + */ + iterator end() { return iterator (m_matrix, m_matrix.template subVectors()); } + /** const version of end() */ + const_iterator end() const { return const_iterator(m_matrix, m_matrix.template subVectors()); } + /** const version of end() */ + const_iterator cend() const { return const_iterator(m_matrix, m_matrix.template subVectors()); } + + /** returns a reverse iterator to the row (resp. column) before the first row (resp. 
column) of the nested expression + * \sa begin(), cend() + */ + reverse_iterator rend() { return reverse_iterator (m_matrix, -1); } + /** const version of rend() */ + const_reverse_iterator rend() const { return const_reverse_iterator (m_matrix, -1); } + /** const version of rend() */ + const_reverse_iterator crend() const { return const_reverse_iterator (m_matrix, -1); } + /** \returns a row or column vector expression of \c *this reduxed by \a func * * The template parameter \a BinaryOp is the type of the functor * of the custom redux operator. Note that func must be an associative operator. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise() */ template EIGEN_DEVICE_FUNC const typename ReduxReturnType::Type redux(const BinaryOp& func = BinaryOp()) const - { return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); + } typedef typename ReturnType::Type MinCoeffReturnType; typedef typename ReturnType::Type MaxCoeffReturnType; - typedef typename ReturnType::Type SquaredNormReturnType; - typedef typename ReturnType::Type NormReturnType; + typedef PartialReduxExpr, const ExpressionTypeNestedCleaned>,internal::member_sum,Direction> SquaredNormReturnType; + typedef CwiseUnaryOp, const SquaredNormReturnType> NormReturnType; typedef typename ReturnType::Type BlueNormReturnType; typedef typename ReturnType::Type StableNormReturnType; typedef typename ReturnType::Type HypotNormReturnType; typedef typename ReturnType::Type SumReturnType; - typedef typename ReturnType::Type MeanReturnType; + typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType; typedef typename ReturnType::Type AllReturnType; typedef typename 
ReturnType::Type AnyReturnType; - typedef PartialReduxExpr, Direction> CountReturnType; + typedef PartialReduxExpr, Direction> CountReturnType; typedef typename ReturnType::Type ProdReturnType; typedef Reverse ConstReverseReturnType; typedef Reverse ReverseReturnType; template struct LpNormReturnType { - typedef PartialReduxExpr,Direction> Type; + typedef PartialReduxExpr,Direction> Type; }; /** \returns a row (or column) vector expression of the smallest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_minCoeff.cpp @@ -302,11 +374,17 @@ template class VectorwiseOp * \sa DenseBase::minCoeff() */ EIGEN_DEVICE_FUNC const MinCoeffReturnType minCoeff() const - { return MinCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MinCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the largest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_maxCoeff.cpp @@ -315,7 +393,10 @@ template class VectorwiseOp * \sa DenseBase::maxCoeff() */ EIGEN_DEVICE_FUNC const MaxCoeffReturnType maxCoeff() const - { return MaxCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MaxCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the squared norm * of each column (or row) of the referenced expression. 
@@ -327,7 +408,7 @@ template class VectorwiseOp * \sa DenseBase::squaredNorm() */ EIGEN_DEVICE_FUNC const SquaredNormReturnType squaredNorm() const - { return SquaredNormReturnType(_expression()); } + { return SquaredNormReturnType(m_matrix.cwiseAbs2()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -339,7 +420,7 @@ template class VectorwiseOp * \sa DenseBase::norm() */ EIGEN_DEVICE_FUNC const NormReturnType norm() const - { return NormReturnType(_expression()); } + { return NormReturnType(squaredNorm()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -404,7 +485,7 @@ template class VectorwiseOp * \sa DenseBase::mean() */ EIGEN_DEVICE_FUNC const MeanReturnType mean() const - { return MeanReturnType(_expression()); } + { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); } /** \returns a row (or column) vector expression representing * whether \b all coefficients of each respective column (or row) are \c true. 
@@ -500,7 +581,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME - return const_cast(m_matrix = extendedTo(other.derived())); + return m_matrix = extendedTo(other.derived()); } /** Adds the vector \a other to each subvector of \c *this */ @@ -510,7 +591,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast(m_matrix += extendedTo(other.derived())); + return m_matrix += extendedTo(other.derived()); } /** Substracts the vector \a other to each subvector of \c *this */ @@ -520,7 +601,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast(m_matrix -= extendedTo(other.derived())); + return m_matrix -= extendedTo(other.derived()); } /** Multiples each subvector of \c *this by the vector \a other */ @@ -532,7 +613,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix *= extendedTo(other.derived()); - return const_cast(m_matrix); + return m_matrix; } /** Divides each subvector of \c *this by the vector \a other */ @@ -544,7 +625,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix /= extendedTo(other.derived()); - return const_cast(m_matrix); + return m_matrix; } /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */ @@ -609,7 +690,7 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC CwiseBinaryOp, const ExpressionTypeNestedCleaned, - const typename OppositeExtendedType::Type>::Type> + const typename OppositeExtendedType::Type> normalized() const { return 
m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); } @@ -658,7 +739,15 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC const HNormalizedReturnType hnormalized() const; +# ifdef EIGEN_VECTORWISEOP_PLUGIN +# include EIGEN_VECTORWISEOP_PLUGIN +# endif + protected: + Index redux_length() const + { + return Direction==Vertical ? m_matrix.rows() : m_matrix.cols(); + } ExpressionTypeNested m_matrix; }; @@ -670,7 +759,7 @@ template class VectorwiseOp * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::ColwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ColwiseReturnType DenseBase::colwise() { return ColwiseReturnType(derived()); @@ -684,7 +773,7 @@ DenseBase::colwise() * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::RowwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::RowwiseReturnType DenseBase::rowwise() { return RowwiseReturnType(derived()); diff --git a/inst/include/Eigen/src/Core/Visitor.h b/inst/include/Eigen/src/Core/Visitor.h index 54c1883d..00bcca87 100644 --- a/inst/include/Eigen/src/Core/Visitor.h +++ b/inst/include/Eigen/src/Core/Visitor.h @@ -10,7 +10,7 @@ #ifndef EIGEN_VISITOR_H #define EIGEN_VISITOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -40,6 +40,14 @@ struct visitor_impl } }; +// This specialization enables visitors on empty matrices at compile-time +template +struct visitor_impl { + EIGEN_DEVICE_FUNC + static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/) + {} +}; + template struct visitor_impl { @@ -62,22 +70,22 @@ class visitor_evaluator public: EIGEN_DEVICE_FUNC explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - + enum { RowsAtCompileTime = XprType::RowsAtCompileTime, CoeffReadCost = 
internal::evaluator::CoeffReadCost }; - - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const { return m_evaluator.coeff(row, col); } - + protected: internal::evaluator m_evaluator; const XprType &m_xpr; @@ -99,6 +107,8 @@ class visitor_evaluator * \note compared to one or two \em for \em loops, visitors offer automatic * unrolling for small fixed size matrix. * + * \note if the matrix is empty, then the visitor is left unchanged. + * * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux() */ template @@ -106,12 +116,15 @@ template EIGEN_DEVICE_FUNC void DenseBase::visit(Visitor& visitor) const { + if(size()==0) + return; + typedef typename internal::visitor_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - + enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits::Cost <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits::Cost) <= EIGEN_UNROLLING_LIMIT }; return internal::visitor_impl::run(thisEval, visitor); } @@ -124,6 +137,9 @@ namespace internal { template struct coeff_visitor { + // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc + EIGEN_DEVICE_FUNC + coeff_visitor() : row(-1), col(-1), res(0) {} typedef typename Derived::Scalar Scalar; Index row, col; Scalar res; @@ -141,7 +157,7 @@ struct coeff_visitor * * \sa 
DenseBase::minCoeff(Index*, Index*) */ -template +template struct min_coeff_visitor : coeff_visitor { typedef typename Derived::Scalar Scalar; @@ -157,8 +173,40 @@ struct min_coeff_visitor : coeff_visitor } }; -template -struct functor_traits > { +template +struct min_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res)) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template +struct min_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(value) || value < this->res) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template + struct functor_traits > { enum { Cost = NumTraits::AddCost }; @@ -169,10 +217,10 @@ struct functor_traits > { * * \sa DenseBase::maxCoeff(Index*, Index*) */ -template +template struct max_coeff_visitor : coeff_visitor { - typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar Scalar; EIGEN_DEVICE_FUNC void operator() (const Scalar& value, Index i, Index j) { @@ -185,8 +233,40 @@ struct max_coeff_visitor : coeff_visitor } }; -template -struct functor_traits > { +template +struct max_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res)) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template +struct max_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(value) || value > this->res) + { + this->res = value; + this->row = i; + 
this->col = j; + } + } +}; + +template +struct functor_traits > { enum { Cost = NumTraits::AddCost }; @@ -196,17 +276,24 @@ struct functor_traits > { /** \fn DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const * \returns the minimum of all coefficients of *this and puts in *row and *col its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const { - internal::min_coeff_visitor minVisitor; + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + + internal::min_coeff_visitor minVisitor; this->visit(minVisitor); *rowId = minVisitor.row; if (colId) *colId = minVisitor.col; @@ -214,18 +301,25 @@ DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const } /** \returns the minimum of all coefficients of *this and puts in *index its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. 
* * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - internal::min_coeff_visitor minVisitor; + internal::min_coeff_visitor minVisitor; this->visit(minVisitor); *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row); return minVisitor.res; @@ -233,17 +327,24 @@ DenseBase::minCoeff(IndexType* index) const /** \fn DenseBase::maxCoeff(IndexType* rowId, IndexType* colId) const * \returns the maximum of all coefficients of *this and puts in *row and *col its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const { - internal::max_coeff_visitor maxVisitor; + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + + internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); *rowPtr = maxVisitor.row; if (colPtr) *colPtr = maxVisitor.col; @@ -251,18 +352,25 @@ DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const } /** \returns the maximum of all coefficients of *this and puts in *index its location. - * \warning the result is undefined if \c *this contains NaN. 
+ * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - internal::max_coeff_visitor maxVisitor; + internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row; return maxVisitor.res; diff --git a/inst/include/Eigen/src/Core/arch/AVX/Complex.h b/inst/include/Eigen/src/Core/arch/AVX/Complex.h index 7fa61969..ab7bd6c6 100644 --- a/inst/include/Eigen/src/Core/arch/AVX/Complex.h +++ b/inst/include/Eigen/src/Core/arch/AVX/Complex.h @@ -22,6 +22,7 @@ struct Packet4cf __m256 v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet4cf type; @@ -37,6 +38,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -44,8 +46,20 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet8f as_real; + enum { + size=4, + alignment=Aligned32, + vectorizable=true, + masked_load_available=false, + 
masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -67,10 +81,17 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con return Packet4cf(result); } +template <> +EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { + __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu(&numext::real_ref(*from))); } @@ -140,70 +161,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe Packet2cf(_mm256_extractf128_ps(a.v,1)))); } -template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs) -{ - Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, 
_MM_SHUFFLE(3, 1, 2 ,0)); - Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); - t0 = _mm256_hadd_ps(t0,t1); - Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); - Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); - t2 = _mm256_hadd_ps(t2,t3); - - t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); - t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); - - return Packet4cf(_mm256_add_ps(t1,t3)); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) { return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) @@ -228,6 +191,7 @@ struct Packet2cd __m256d v; }; +#ifndef 
EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet2cd type; @@ -243,6 +207,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -250,8 +215,20 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet4d as_real; + enum { + size=2, + alignment=Aligned32, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } @@ -272,10 +249,17 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con return Packet2cd(_mm256_addsub_pd(even, odd)); } +template <> +EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { + __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return 
Packet2cd(_mm256_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } @@ -327,63 +311,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template<> EIGEN_STRONG_INLINE Packet2cd preduxp(const Packet2cd* vecs) -{ - Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); - Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); - - return Packet2cd(_mm256_add_pd(t0,t1)); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd& a) { return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return 
pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) @@ -424,24 +357,12 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0].v = tmp; } -template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex b) -{ - return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,1|2)); -} - -template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex b) -{ - return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,1|2)); +template<> EIGEN_STRONG_INLINE Packet2cd psqrt(const Packet2cd& a) { + return psqrt_complex(a); } -template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex b) -{ - return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,(1<<7)|(1<<6))); -} - -template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex b) -{ - return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,(1<<3)|(1<<2))); +template<> EIGEN_STRONG_INLINE Packet4cf psqrt(const Packet4cf& a) { + return psqrt_complex(a); } } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/AVX/MathFunctions.h b/inst/include/Eigen/src/Core/arch/AVX/MathFunctions.h index 6af67ce2..67041c81 100644 --- a/inst/include/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/inst/include/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,7 +10,7 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -/* The sin, cos, exp, and log functions of this file are loosely derived from +/* The sin and cos functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -18,187 +18,50 @@ namespace Eigen { namespace internal { -inline Packet8i pshiftleft(Packet8i v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi32(v, n); -#else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); - __m128i hi = 
_mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +psin(const Packet8f& _x) { + return psin_float(_x); } -inline Packet8f pshiftright(Packet8f v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); - return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); -#endif +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +pcos(const Packet8f& _x) { + return pcos_float(_x); } -// Sine function -// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and -// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants -// are (anti-)symmetric and thus have only odd/even coefficients template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -psin(const Packet8f& _x) { - Packet8f x = _x; +plog(const Packet8f& _x) { + return plog_float(_x); +} - // Some useful values. - _EIGEN_DECLARE_CONST_Packet8i(one, 1); - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); - _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); - - // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period. 
- Packet8f z = pmul(x, p8f_one_over_pi); - Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four)); - x = pmadd(shift, p8f_neg_pi_first, x); - x = pmadd(shift, p8f_neg_pi_second, x); - x = pmadd(shift, p8f_neg_pi_third, x); - z = pmul(x, p8f_four_over_pi); - - // Make a mask for the entries that need flipping, i.e. wherever the shift - // is odd. - Packet8i shift_ints = _mm256_cvtps_epi32(shift); - Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); - Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); - - // Create a mask for which interpolant to use, i.e. if z > 1, then the mask - // is set to ones for that entry. - Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ); - - // Evaluate the polynomial for the interval [1,3] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); - Packet8f z_minus_two = psub(z, p8f_two); - Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); - Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); - right = pmadd(right, z_minus_two2, p8f_coeff_right_2); - right = pmadd(right, z_minus_two2, p8f_coeff_right_0); - - // Evaluate the polynomial for the interval [-1,1] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); - Packet8f z2 = pmul(z, z); - Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); - left = pmadd(left, z2, p8f_coeff_left_3); - left = pmadd(left, z2, p8f_coeff_left_1); - left = pmul(left, z); - - // Assemble the results, i.e. 
select the left and right polynomials. - left = _mm256_andnot_ps(ival_mask, left); - right = _mm256_and_ps(ival_mask, right); - Packet8f res = _mm256_or_ps(left, right); - - // Flip the sign on the odd intervals and return the result. - res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); - return res; +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +plog(const Packet4d& _x) { + return plog_double(_x); } -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -// TODO(gonnet): Further reduce the interval allowing for lower-degree -// polynomial interpolants -> ... -> profit! template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -plog(const Packet8f& _x) { - Packet8f x = _x; - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); - - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); - - // The smallest non denormalized float number. - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); - - // Polynomial coefficients. 
- _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); - - Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN - Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, p8f_min_norm_pos); - - Packet8f emm0 = pshiftright(x,23); - Packet8f e = _mm256_sub_ps(emm0, p8f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm256_and_ps(x, p8f_inv_mant_mask); - x = _mm256_or_ps(x, p8f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); - Packet8f tmp = _mm256_and_ps(x, mask); - x = psub(x, p8f_1); - e = psub(e, _mm256_and_ps(p8f_1, mask)); - x = padd(x, tmp); - - Packet8f x2 = pmul(x, x); - Packet8f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. 
- Packet8f y, y1, y2; - y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); - y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); - y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); - y = pmadd(y, x, p8f_cephes_log_p2); - y1 = pmadd(y1, x, p8f_cephes_log_p5); - y2 = pmadd(y2, x, p8f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p8f_cephes_log_q1); - tmp = pmul(x2, p8f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p8f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm256_or_ps( - _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), - _mm256_and_ps(iszero_mask, p8f_minus_inf)); +plog2(const Packet8f& _x) { + return plog2_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +plog2(const Packet4d& _x) { + return plog2_double(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f plog1p(const Packet8f& _x) { + return generic_plog1p(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f pexpm1(const Packet8f& _x) { + return generic_expm1(_x); } // Exponential function. 
Works by writing "x = m*log(2) + r" where @@ -207,149 +70,21 @@ plog(const Packet8f& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f pexp(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); - - _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); - - // Clamp x. - Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); - - // Express exp(x) as exp(m*ln(2) + r), start by extracting - // m = floor(x/ln(2) + 0.5). - Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); - -// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is -// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating -// truncation errors. Note that we don't use the "pmadd" function here to -// ensure that a precision-preserving FMA instruction is used. -#ifdef EIGEN_VECTORIZE_FMA - _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); - Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); -#else - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); - Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); - r = psub(r, pmul(m, p8f_cephes_exp_C2)); -#endif - - Packet8f r2 = pmul(r, r); - - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. 
- Packet8f y = p8f_cephes_exp_p0; - y = pmadd(y, r, p8f_cephes_exp_p1); - y = pmadd(y, r, p8f_cephes_exp_p2); - y = pmadd(y, r, p8f_cephes_exp_p3); - y = pmadd(y, r, p8f_cephes_exp_p4); - y = pmadd(y, r, p8f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p8f_1); - - // Build emm0 = 2^m. - Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); - emm0 = pshiftleft(emm0, 23); - - // Return 2^m * exp(r). - return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); + return pexp_float(_x); } // Hyperbolic Tangent function. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -ptanh(const Packet8f& x) { - return internal::generic_fast_tanh_float(x); +ptanh(const Packet8f& _x) { + return internal::generic_fast_tanh_float(_x); } +// Exponential function for doubles. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d pexp(const Packet4d& _x) { - Packet4d x = _x; - - _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); - _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); - _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); - - Packet4d tmp, fx; - - // clamp 
x - x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); - // Express exp(x) as exp(g + n*log(2)). - fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); - - // Get the integer modulus of log(2), i.e. the "n" described above. - fx = _mm256_floor_pd(fx); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - tmp = pmul(fx, p4d_cephes_exp_C1); - Packet4d z = pmul(fx, p4d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet4d x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. - Packet4d px = p4d_cephes_exp_p0; - px = pmadd(px, x2, p4d_cephes_exp_p1); - px = pmadd(px, x2, p4d_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet4d qx = p4d_cephes_exp_q0; - qx = pmadd(qx, x2, p4d_cephes_exp_q1); - qx = pmadd(qx, x2, p4d_cephes_exp_q2); - qx = pmadd(qx, x2, p4d_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm256_div_pd(px, psub(qx, px)); - x = pmadd(p4d_2, x, p4d_1); - - // Build e=2^n by constructing the exponents in a 128-bit vector and - // shifting them to where they belong in double-precision values. - __m128i emm0 = _mm256_cvtpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i lo = _mm_slli_epi64(emm0, 52); - __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); - __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); - e = _mm256_insertf128_si256(e, hi, 1); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); + return pexp_double(_x); } // Functions for sqrt. 
@@ -362,37 +97,39 @@ pexp(const Packet4d& _x) { // For detail see here: http://www.beyond3d.com/content/articles/8/ #if EIGEN_FAST_MATH template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -psqrt(const Packet8f& _x) { - Packet8f half = pmul(_x, pset1(.5f)); - Packet8f denormal_mask = _mm256_and_ps( - _mm256_cmp_ps(_x, pset1((std::numeric_limits::min)()), - _CMP_LT_OQ), - _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ)); +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f psqrt(const Packet8f& _x) { + Packet8f minus_half_x = pmul(_x, pset1(-0.5f)); + Packet8f denormal_mask = pandnot( + pcmp_lt(_x, pset1((std::numeric_limits::min)())), + pcmp_lt(_x, pzero(_x))); // Compute approximate reciprocal sqrt. Packet8f x = _mm256_rsqrt_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1(1.5f))); // Flush results for denormals to zero. 
- return _mm256_andnot_ps(denormal_mask, pmul(_x,x)); + return pandnot(pmul(_x,x), denormal_mask); } + #else + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f psqrt(const Packet8f& x) { - return _mm256_sqrt_ps(x); +Packet8f psqrt(const Packet8f& _x) { + return _mm256_sqrt_ps(_x); } + #endif + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d psqrt(const Packet4d& x) { - return _mm256_sqrt_pd(x); +Packet4d psqrt(const Packet4d& _x) { + return _mm256_sqrt_pd(_x); } -#if EIGEN_FAST_MATH +#if EIGEN_FAST_MATH template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f prsqrt(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); @@ -401,36 +138,88 @@ Packet8f prsqrt(const Packet8f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). - Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); - Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); - - // Fill in NaNs and Infs for the negative/zero entries. - Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); - Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); - Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), - _mm256_and_ps(zero_mask, p8f_inf)); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); - - // Insert NaNs and Infs in all the right places. 
- return _mm256_or_ps(x, infs_and_nans); + Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); + Packet8f inf_mask = _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ); + Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask); + + // Compute an approximate result using the rsqrt intrinsic. + Packet8f y_approx = _mm256_rsqrt_ps(_x); + + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five)); + + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). 
+ return pselect(not_normal_finite_mask, y_approx, y_newton); } #else template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f prsqrt(const Packet8f& x) { +Packet8f prsqrt(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x)); + return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x)); } #endif template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d prsqrt(const Packet4d& x) { +Packet4d prsqrt(const Packet4d& _x) { _EIGEN_DECLARE_CONST_Packet4d(one, 1.0); - return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x)); + return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x)); } +F16_PACKET_FUNCTION(Packet8f, Packet8h, psin) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp) +F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh) +F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt) +F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt) + +template <> +EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) { + Packet8f fexponent; + const Packet8h out = float2half(pfrexp(half2float(a), fexponent)); + exponent = float2half(fexponent); + return out; +} + +template <> +EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) { + return float2half(pldexp(half2float(a), half2float(exponent))); +} + +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh) +BF16_PACKET_FUNCTION(Packet8f, 
Packet8bf, psqrt) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt) + +template <> +EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) { + Packet8f fexponent; + const Packet8bf out = F32ToBf16(pfrexp(Bf16ToF32(a), fexponent)); + exponent = F32ToBf16(fexponent); + return out; +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) { + return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent))); +} } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/AVX/PacketMath.h b/inst/include/Eigen/src/Core/arch/AVX/PacketMath.h index 923a124b..7fc32fd7 100644 --- a/inst/include/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/inst/include/Eigen/src/Core/arch/AVX/PacketMath.h @@ -18,11 +18,11 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif @@ -31,10 +31,14 @@ namespace internal { typedef __m256 Packet8f; typedef __m256i Packet8i; typedef __m256d Packet4d; +typedef eigen_packet_wrapper<__m128i, 2> Packet8h; +typedef eigen_packet_wrapper<__m128i, 3> Packet8bf; template<> struct is_arithmetic<__m256> { enum { value = true }; }; template<> struct is_arithmetic<__m256i> { enum { value = true }; }; template<> struct is_arithmetic<__m256d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ const Packet8f p8f_##NAME = pset1(X) @@ -58,21 +62,28 @@ template<> struct packet_traits : default_packet_traits enum { Vectorizable = 1, 
AlignedOnScalar = 1, - size=8, + size = 8, HasHalfPacket = 1, - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = 0, - HasLog = 1, - HasExp = 1, + HasCmp = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasNdtri = 1, + HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, HasBlend = 1, HasRound = 1, HasFloor = 1, - HasCeil = 1 + HasCeil = 1, + HasRint = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -85,14 +96,104 @@ template<> struct packet_traits : default_packet_traits size=4, HasHalfPacket = 1, + HasCmp = 1, HasDiv = 1, + HasLog = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, HasBlend = 1, HasRound = 1, HasFloor = 1, - HasCeil = 1 + HasCeil = 1, + HasRint = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8h type; + // There is no half-size packet for Packet8h. + typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasBessel = 1, + HasNdtri = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8bf type; + // There is no half-size packet for current Packet8bf. + // TODO: support as SSE path. 
+ typedef Packet8bf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasBessel = 1, + HasNdtri = 1 }; }; #endif @@ -113,14 +214,45 @@ template<> struct packet_traits : default_packet_traits }; */ -template<> struct unpacket_traits { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; +template<> struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet8i integer_packet; + typedef uint8_t mask_t; + enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true}; +}; +template<> struct unpacket_traits { + typedef double type; + typedef Packet2d half; + enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; }; +template<> struct unpacket_traits { typedef bfloat16 type; typedef Packet8bf half; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; + +// Helper function for bit packing snippet of low precision comparison. 
+// It packs the flags from 16x16 to 8x16. +EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) { + return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0), + _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); +} + template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet8f pset1frombits(unsigned int from) { return _mm256_castsi256_ps(pset1(from)); } +template<> EIGEN_STRONG_INLINE Packet4d pset1frombits(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); } + + +template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); } +template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); } +template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); } + template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } @@ -129,9 +261,27 @@ template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { retur template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d 
padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_add_epi32(a,b); +#else + __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_sub_epi32(a,b); +#else + __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { @@ -148,7 +298,15 @@ template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8f pmul(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pmul(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } - +template<> EIGEN_STRONG_INLINE Packet8i pmul(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_mullo_epi32(a,b); +#else + const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} 
template<> EIGEN_STRONG_INLINE Packet8f pdiv(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pdiv(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } @@ -157,7 +315,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co return pset1(0); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { #if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, @@ -184,14 +342,112 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& } #endif -template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } + +template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const 
Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); } + + +template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpeq_epi32(a,b); +#else + __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may flip + // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + Packet8f res; + asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::min. + return _mm256_min_ps(b,a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet4d res; + asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::min. + return _mm256_min_pd(b,a); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet8f res; + asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::max. 
+ return _mm256_max_ps(b,a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet4d res; + asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::max. + return _mm256_max_pd(b,a); +#endif +} -template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } +// Add specializations for min/max with prescribed NaN progation. +template<> +EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_nan(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_nan(a, b, pmax); +} -template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } -template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) 
{ return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet8f print(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet4d print(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet8f pceil(const Packet8f& a) { return _mm256_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { return _mm256_ceil_pd(a); } @@ -199,17 +455,124 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet8i ptrue(const Packet8i& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + return _mm256_cmpeq_epi32(a,a); +#else + const __m256 b = _mm256_castsi256_ps(a); + return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + const __m256i b = _mm256_castps_si256(a); + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b)); +#else + return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4d ptrue(const Packet4d& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqq has lower latency than the more general vcmppd + const __m256i b = _mm256_castpd_si256(a); + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b)); +#else + return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ); +#endif +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE 
Packet8i pand(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_and_si256(a,b); +#else + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i por(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_or_si256(a,b); +#else + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pxor(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_xor_si256(a,b); +#else + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} -template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet8i pandnot(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_andnot_si256(b,a); +#else + return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a))); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pround(const 
Packet8f& a) +{ + const Packet8f mask = pset1frombits(static_cast(0x80000000u)); + const Packet8f prev0dot5 = pset1frombits(static_cast(0x3EFFFFFFu)); + return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} +template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) +{ + const Packet4d mask = pset1frombits(static_cast(0x8000000000000000ull)); + const Packet4d prev0dot5 = pset1frombits(static_cast(0x3FDFFFFFFFFFFFFFull)); + return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) +{ return _mm256_blendv_ps(b,a,mask); } +template<> EIGEN_STRONG_INLINE Packet4d pselect(const Packet4d& mask, const Packet4d& a, const Packet4d& b) +{ return _mm256_blendv_pd(b,a,mask); } + +template EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srai_epi32(a, N); +#else + __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srli_epi32(a, N); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(a, N); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pload(const float* 
from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } @@ -219,6 +582,14 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { EI template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from, uint8_t umask) { + Packet8i mask = _mm256_set1_epi8(static_cast(umask)); + const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + mask = por(mask, bit_mask); + mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask); +} + // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { @@ -226,7 +597,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); // return _mm256_unpacklo_ps(tmp,tmp); - + // _mm256_insertf128_ps is very slow on Haswell, thus: Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); // mimic an "inplace" permutation of the lower 128bits using a blend @@ -256,6 +627,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE 
_mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { + Packet8i mask = _mm256_set1_epi8(static_cast(umask)); + const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + mask = por(mask, bit_mask); + mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); + EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from); +} + // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); template<> EIGEN_DEVICE_FUNC inline Packet8f pgather(const float* from, Index stride) @@ -354,47 +733,66 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) return _mm256_and_pd(a,mask); } -// preduxp should be ok -// FIXME: why is this ok? why isn't the simply implementation working as expected? 
-template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) -{ - __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); - __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); - __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); - __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); - - __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); - __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); - __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); - __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); +template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Packet8f& exponent) { + return pfrexp_generic(a,exponent); +} - __m256 sum1 = _mm256_add_ps(perm1, hsum5); - __m256 sum2 = _mm256_add_ps(perm2, hsum6); - __m256 sum3 = _mm256_add_ps(perm3, hsum7); - __m256 sum4 = _mm256_add_ps(perm4, hsum8); +// Extract exponent without existence of Packet4l. 
+template<> +EIGEN_STRONG_INLINE +Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) { + const Packet4d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); + __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask)); +#ifdef EIGEN_VECTORIZE_AVX2 + a_expo = _mm256_srli_epi64(a_expo, 52); + __m128i lo = _mm256_extractf128_si256(a_expo, 0); + __m128i hi = _mm256_extractf128_si256(a_expo, 1); +#else + __m128i lo = _mm256_extractf128_si256(a_expo, 0); + __m128i hi = _mm256_extractf128_si256(a_expo, 1); + lo = _mm_srli_epi64(lo, 52); + hi = _mm_srli_epi64(hi, 52); +#endif + Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3)); + Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3)); + Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0); + exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1); + return exponent; +} - __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); - return final; +template<> EIGEN_STRONG_INLINE Packet4d pfrexp(const Packet4d& a, Packet4d& exponent) { + return pfrexp_generic(a, exponent); } -template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) -{ - Packet4d tmp0, tmp1; - tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); +template<> EIGEN_STRONG_INLINE Packet8f pldexp(const Packet8f& a, const Packet8f& exponent) { + return pldexp_generic(a, exponent); +} - return _mm256_blend_pd(tmp0, tmp1, 0xC); +template<> EIGEN_STRONG_INLINE Packet4d pldexp(const Packet4d& a, const Packet4d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet4d max_exponent = pset1(2099.0); + const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), 
max_exponent)); + + // Split 2^e into four factors and multiply. + const Packet4i bias = pset1(1023); + Packet4i b = parithmetic_shift_right<2>(e); // floor(e/4) + + // 2^b + Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3); + Packet4i lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + Packet4d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + + // 2^(e - 3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3); + lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + out = pmul(out, c); // a * 2^e + return out; } template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) @@ -406,7 +804,7 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); } -template<> EIGEN_STRONG_INLINE Packet4f predux_downto4(const Packet8f& a) +template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); } @@ -450,93 +848,16 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x) +// { +// return _mm256_movemask_ps(x)==0xFF; +// } -template -struct palign_impl +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) { - static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) - { - if (Offset==1) - { - first = _mm256_blend_ps(first, second, 1); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 
0x88); - } - else if (Offset==2) - { - first = _mm256_blend_ps(first, second, 3); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xcc); - } - else if (Offset==3) - { - first = _mm256_blend_ps(first, second, 7); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xee); - } - else if (Offset==4) - { - first = _mm256_blend_ps(first, second, 15); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); - } - else if (Offset==5) - { - first = _mm256_blend_ps(first, second, 31); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0x88); - } - else if (Offset==6) - { - first = _mm256_blend_ps(first, second, 63); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xcc); - } - else if (Offset==7) - { - first = _mm256_blend_ps(first, second, 127); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xee); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) - { - if (Offset==1) - { - first = _mm256_blend_pd(first, second, 1); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 0xA); - } - else if (Offset==2) - { - first = 
_mm256_blend_pd(first, second, 3); - first = _mm256_permute2f128_pd(first, first, 1); - } - else if (Offset==3) - { - first = _mm256_blend_pd(first, second, 7); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 5); - } - } -}; + return _mm256_movemask_ps(x)!=0; +} EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -610,24 +931,640 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); } -template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) +// Packet math for Eigen::half + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { + return _mm_set1_epi16(numext::bit_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { + return numext::bit_cast(static_cast(_mm_extract_epi16(from, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { + return _mm_load_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { + return _mm_loadu_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploaddup(const Eigen::half* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + const numext::uint16_t c = numext::bit_cast(from[2]); + const numext::uint16_t d 
= numext::bit_cast(from[3]); + return _mm_set_epi16(d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad(const Eigen::half* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + return _mm_set_epi16(b, b, b, b, a, a, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { + return _mm_cmpeq_epi32(a, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) { + const __m128i sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_andnot_si128(sign_mask, a); +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + const numext::uint16_t s0 = numext::bit_cast(Eigen::half(aux[0])); + const numext::uint16_t s1 = numext::bit_cast(Eigen::half(aux[1])); + const numext::uint16_t s2 = numext::bit_cast(Eigen::half(aux[2])); + const numext::uint16_t s3 = numext::bit_cast(Eigen::half(aux[3])); + const numext::uint16_t s4 = numext::bit_cast(Eigen::half(aux[4])); + const numext::uint16_t s5 = numext::bit_cast(Eigen::half(aux[5])); + const numext::uint16_t s6 = numext::bit_cast(Eigen::half(aux[6])); + const numext::uint16_t s7 = numext::bit_cast(Eigen::half(aux[7])); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmin(const Packet8h& a, + const Packet8h& b) { + return float2half(pmin(half2float(a), 
half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmax(const Packet8h& a, + const Packet8h& b) { + return float2half(pmax(half2float(a), half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h plset(const half& a) { + return float2half(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { + // in some cases Packet4i is a wrapper around __m128i, so we either need to + // cast to Packet4i to directly call the intrinsics as below: + return _mm_or_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { + return _mm_xor_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { + return _mm_and_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { + return _mm_andnot_si128(b,a); +} + +template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { + return _mm_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet8h pround(const Packet8h& a) { + return float2half(pround(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h print(const Packet8h& a) { + return float2half(print(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pceil(const Packet8h& a) { + return float2half(pceil(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pfloor(const Packet8h& a) { + return float2half(pfloor(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_eq(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_le(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_lt(half2float(a), half2float(b))); +} + +template<> 
EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + Packet8h sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_xor_si128(a, sign_mask); +} + +template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) { - return _mm256_blend_ps(a,pset1(b),1); + const numext::uint16_t s0 = numext::bit_cast(from[0*stride]); + const numext::uint16_t s1 = numext::bit_cast(from[1*stride]); + const numext::uint16_t s2 = numext::bit_cast(from[2*stride]); + const numext::uint16_t s3 = numext::bit_cast(from[3*stride]); + const numext::uint16_t s4 = numext::bit_cast(from[4*stride]); + const numext::uint16_t s5 = numext::bit_cast(from[5*stride]); + const numext::uint16_t s6 = numext::bit_cast(from[6*stride]); + const numext::uint16_t s7 = numext::bit_cast(from[7*stride]); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); } -template<> EIGEN_STRONG_INLINE Packet4d 
pinsertfirst(const Packet4d& a, double b) +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) { - return _mm256_blend_pd(a,pset1(b),1); + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); } -template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) +template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) { - return _mm256_blend_ps(a,pset1(b),(1<<7)); + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + return _mm_shuffle_epi8(a,m); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + __m128i e = kernel.packet[4]; + __m128i f = kernel.packet[5]; + __m128i g = kernel.packet[6]; + __m128i h = kernel.packet[7]; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i 
c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0] = a0b0c0d0e0f0g0h0; + kernel.packet[1] = a1b1c1d1e1f1g1h1; + kernel.packet[2] = a2b2c2d2e2f2g2h2; + kernel.packet[3] = a3b3c3d3e3f3g3h3; + kernel.packet[4] = a4b4c4d4e4f4g4h4; + kernel.packet[5] = a5b5c5d5e5f5g5h5; + kernel.packet[6] = a6b6c6d6e6f6g6h6; + kernel.packet[7] = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + 
+ kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + +// BFloat16 implementation. + +EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + __m256i extend = _mm256_cvtepu16_epi32(a); + return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16)); +#else + __m128i lo = _mm_cvtepu16_epi32(a); + __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8)); + __m128i lo_shift = _mm_slli_epi32(lo, 16); + __m128i hi_shift = _mm_slli_epi32(hi, 16); + return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1)); +#endif } -template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) +// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm. +EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { + Packet8bf r; + + __m256i input = _mm256_castps_si256(a); + +#ifdef EIGEN_VECTORIZE_AVX2 + // uint32_t lsb = (input >> 16); + __m256i t = _mm256_srli_epi32(input, 16); + // uint32_t lsb = lsb & 1; + t = _mm256_and_si256(t, _mm256_set1_epi32(1)); + // uint32_t rounding_bias = 0x7fff + lsb; + t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff)); + // input += rounding_bias; + t = _mm256_add_epi32(t, input); + // input = input >> 16; + t = _mm256_srli_epi32(t, 16); + // Check NaN before converting back to bf16 + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); + __m256i nan = _mm256_set1_epi32(0x7fc0); + t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask)); + // output = numext::bit_cast(input); + return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), + _mm256_extractf128_si256(t, 1)); +#else + // uint32_t lsb = (input >> 16); + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16); + // uint32_t lsb = lsb & 1; + lo = _mm_and_si128(lo, _mm_set1_epi32(1)); + hi = _mm_and_si128(hi, 
_mm_set1_epi32(1)); + // uint32_t rounding_bias = 0x7fff + lsb; + lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff)); + hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff)); + // input += rounding_bias; + lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0)); + hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1)); + // input = input >> 16; + lo = _mm_srli_epi32(lo, 16); + hi = _mm_srli_epi32(hi, 16); + // Check NaN before converting back to bf16 + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); + __m128i nan = _mm_set1_epi32(0x7fc0); + lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask))); + hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1))); + // output = numext::bit_cast(input); + return _mm_packus_epi32(lo, hi); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8bf pset1(const bfloat16& from) { + return _mm_set1_epi16(numext::bit_cast(from)); +} + +template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& from) { + return numext::bit_cast(static_cast(_mm_extract_epi16(from, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pload(const bfloat16* from) { + return _mm_load_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ploadu(const bfloat16* from) { + return _mm_loadu_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet8bf& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet8bf& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet8bf +ploaddup(const bfloat16* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + const numext::uint16_t c = numext::bit_cast(from[2]); + const numext::uint16_t d = numext::bit_cast(from[3]); + return _mm_set_epi16(d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE 
Packet8bf +ploadquad(const bfloat16* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + return _mm_set_epi16(b, b, b, b, a, a, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) { + return _mm_cmpeq_epi32(a, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) { + const __m128i sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_andnot_si128(sign_mask, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pmin(const Packet8bf& a, + const Packet8bf& b) { + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pmax(const Packet8bf& a, + const Packet8bf& b) { + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf plset(const bfloat16& a) { + return F32ToBf16(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) { + return _mm_or_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) { + return _mm_xor_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) { + return _mm_and_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) { + return _mm_andnot_si128(b,a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) { + return _mm_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pround(const Packet8bf& a) { - return _mm256_blend_pd(a,pset1(b),(1<<3)); + return F32ToBf16(pround(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf print(const Packet8bf& a) { + return F32ToBf16(print(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pceil(const Packet8bf& a) { + return F32ToBf16(pceil(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE 
Packet8bf pfloor(const Packet8bf& a) { + return F32ToBf16(pfloor(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) { + Packet8bf sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_xor_si128(a, sign_mask); +} + +template<> EIGEN_STRONG_INLINE Packet8bf padd(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(psub(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmul(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pdiv(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(pdiv(Bf16ToF32(a), Bf16ToF32(b))); +} + + +template<> EIGEN_STRONG_INLINE Packet8bf pgather(const bfloat16* from, Index stride) +{ + const numext::uint16_t s0 = numext::bit_cast(from[0*stride]); + const numext::uint16_t s1 = numext::bit_cast(from[1*stride]); + const numext::uint16_t s2 = numext::bit_cast(from[2*stride]); + const numext::uint16_t s3 = numext::bit_cast(from[3*stride]); + const numext::uint16_t s4 = numext::bit_cast(from[4*stride]); + const numext::uint16_t 
s5 = numext::bit_cast(from[5*stride]); + const numext::uint16_t s6 = numext::bit_cast(from[6*stride]); + const numext::uint16_t s7 = numext::bit_cast(from[7*stride]); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); +} + +template<> EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet8bf& from, Index stride) +{ + EIGEN_ALIGN32 bfloat16 aux[8]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) { + return static_cast(predux(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) { + return static_cast(predux_max(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) { + return static_cast(predux_min(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) { + return static_cast(predux_mul(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + return _mm_shuffle_epi8(a,m); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + __m128i e = kernel.packet[4]; + __m128i f = kernel.packet[5]; + __m128i g = kernel.packet[6]; + __m128i h = kernel.packet[7]; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, 
c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + + __m128i ab_03 = _mm_unpacklo_epi16(a, b); + __m128i cd_03 = _mm_unpacklo_epi16(c, d); + __m128i ab_47 = _mm_unpackhi_epi16(a, b); + __m128i cd_47 = _mm_unpackhi_epi16(c, d); + + kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03); + kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03); + kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47); + kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47); } } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/AVX/TypeCasting.h b/inst/include/Eigen/src/Core/arch/AVX/TypeCasting.h index 83bfdc60..d507fb67 100644 --- a/inst/include/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/inst/include/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -35,15 +35,79 @@ struct type_casting_traits { }; +#ifndef EIGEN_VECTORIZE_AVX512 + +template <> +struct 
type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +#endif // EIGEN_VECTORIZE_AVX512 template<> EIGEN_STRONG_INLINE Packet8i pcast(const Packet8f& a) { - return _mm256_cvtps_epi32(a); + return _mm256_cvttps_epi32(a); } template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8i& a) { return _mm256_cvtepi32_ps(a); } +template<> EIGEN_STRONG_INLINE Packet8i preinterpret(const Packet8f& a) { + return _mm256_castps_si256(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Packet8i& a) { + return _mm256_castsi256_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { + return half2float(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8bf& a) { + return Bf16ToF32(a); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { + return float2half(a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet8f& a) { + return F32ToBf16(a); +} + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/arch/AVX512/Complex.h b/inst/include/Eigen/src/Core/arch/AVX512/Complex.h new file mode 100644 index 00000000..49c72b3f --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/AVX512/Complex.h @@ -0,0 +1,422 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_AVX512_H +#define EIGEN_COMPLEX_AVX512_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet8cf +{ + EIGEN_STRONG_INLINE Packet8cf() {} + EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {} + __m512 v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet8cf type; + typedef Packet4cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet4cf half; + typedef Packet16f as_real; + enum { + size = 8, + alignment=unpacket_traits::alignment, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet8cf padd(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf psub(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) +{ + return Packet8cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) +{ + const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000, + 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet8cf(pxor(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) +{ + __m512 
tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pand (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); } + +template <> +EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { + __m512 eq = pcmp_eq(a.v, b.v); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) +{ + return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) +{ + return Packet8cf( _mm512_castpd_ps( ploaddup((const double*)(const void*)from )) ); +} +template<> EIGEN_STRONG_INLINE Packet8cf ploadquad(const std::complex* from) +{ + return Packet8cf( _mm512_castpd_ps( ploadquad((const double*)(const void*)from )) ); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), 
from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather, Packet8cf>(const std::complex* from, Index stride) +{ + return Packet8cf(_mm512_castpd_ps(pgather((const double*)(const void*)from, stride))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet8cf>(std::complex* to, const Packet8cf& from, Index stride) +{ + pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet8cf& a) +{ + return pfirst(Packet2cf(_mm512_castps512_ps128(a.v))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { + return Packet8cf(_mm512_castsi512_ps( + _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), + _mm512_castps_si512(a.v)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet8cf& a) +{ + return predux(padd(Packet4cf(extract256<0>(a.v)), + Packet4cf(extract256<1>(a.v)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a) +{ + return predux_mul(pmul(Packet4cf(extract256<0>(a.v)), + Packet4cf(extract256<1>(a.v)))); +} + +template <> +EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) { + __m256 lane0 = extract256<0>(a.v); + __m256 lane1 = extract256<1>(a.v); + __m256 res = _mm256_add_ps(lane0, lane1); + return Packet4cf(res); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) + +template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) +{ + Packet8cf num = pmul(a, pconj(b)); + __m512 tmp = _mm512_mul_ps(b.v, b.v); + __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1); + __m512 denom = _mm512_add_ps(tmp, tmp2); + return Packet8cf(_mm512_div_ps(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& x) +{ + return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); +} + +//---------- double ---------- +struct Packet4cd +{ + EIGEN_STRONG_INLINE Packet4cd() {} + EIGEN_STRONG_INLINE 
explicit Packet4cd(const __m512d& a) : v(a) {} + __m512d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet4cd type; + typedef Packet2cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cd half; + typedef Packet8d as_real; + enum { + size = 4, + alignment = unpacket_traits::alignment, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4cd padd(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd psub(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) +{ + const __m512d mask = _mm512_castsi512_pd( + _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0, + 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet4cd(pxor(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) +{ + __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0); + __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF); + __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55); + __m512d odd = _mm512_mul_pd(tmp2, tmp3); + return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd ptrue(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd por (const 
Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); } + +template <> +EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { + __m512d eq = pcmp_eq(a.v, b.v); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) +{ + #ifdef EIGEN_VECTORIZE_AVX512DQ + return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); + #else + return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); + #endif +} + +template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { + return Packet4cd(_mm512_insertf64x4( + _mm512_castpd256_pd512(ploaddup(from).v), ploaddup(from+1).v, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather, Packet4cd>(const std::complex* from, Index stride) +{ + return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512( + _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu(from+0*stride).v), ploadu(from+1*stride).v,1)), + _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu(from+2*stride).v), ploadu(from+3*stride).v,1), 1)); +} + +template<> 
EIGEN_DEVICE_FUNC inline void pscatter, Packet4cd>(std::complex* to, const Packet4cd& from, Index stride) +{ + __m512i fromi = _mm512_castpd_si512(from.v); + double* tod = (double*)(void*)to; + _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); + _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); + _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); + _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cd& a) +{ + __m128d low = extract128<0>(a.v); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex(res[0],res[1]); +} + +template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) { + return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3,2,1,0>::mask))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cd& a) +{ + return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), + Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cd& a) +{ + return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), + Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); +} + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const 
Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d) + +template<> EIGEN_STRONG_INLINE Packet4cd pdiv(const Packet4cd& a, const Packet4cd& b) +{ + Packet4cd num = pmul(a, pconj(b)); + __m512d tmp = _mm512_mul_pd(b.v, b.v); + __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp); + return Packet4cd(_mm512_div_pd(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x) +{ + return Packet4cd(_mm512_permute_pd(x.v,0x55)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + PacketBlock pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + PacketBlock pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v); + pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v); + pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v); + pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = 
_mm512_castpd_ps(pb.packet[3]); + kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]); + kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]); + kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]); + kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0,1,0,1>::mask)); // [a0 a1 b0 b1] + __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2,3,2,3>::mask)); // [a2 a3 b2 b3] + __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0,1,0,1>::mask)); // [c0 c1 d0 d1] + __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2,3,2,3>::mask)); // [c2 c3 d2 d3] + + kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1,3,1,3>::mask))); // [a3 b3 c3 d3] + kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0,2,0,2>::mask))); // [a2 b2 c2 d2] + kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1,3,1,3>::mask))); // [a1 b1 c1 d1] + kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0] +} + +template<> EIGEN_STRONG_INLINE Packet4cd psqrt(const Packet4cd& a) { + return psqrt_complex(a); +} + +template<> EIGEN_STRONG_INLINE Packet8cf psqrt(const Packet8cf& a) { + return psqrt_complex(a); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX512_H diff --git a/inst/include/Eigen/src/Core/arch/AVX512/MathFunctions.h b/inst/include/Eigen/src/Core/arch/AVX512/MathFunctions.h index b259c1e1..6fd726d2 100644 --- a/inst/include/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/inst/include/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -15,13 +15,13 @@ namespace Eigen { namespace internal { // Disable the code for older versions of gcc that don't support many of the required avx512 
instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) +#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ - const Packet16f p16f_##NAME = (__m512)pset1(X) + const Packet16f p16f_##NAME = preinterpret(pset1(X)) #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ const Packet8d p8d_##NAME = pset1(X) @@ -29,108 +29,41 @@ namespace internal { #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) +#define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \ + const Packet16bf p16bf_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \ + const Packet16bf p16bf_##NAME = preinterpret(pset1(X)) -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -#if defined(EIGEN_VECTORIZE_AVX512DQ) template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f plog(const Packet16f& _x) { - Packet16f x = _x; - _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f); - - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000); - - // The smallest non denormalized float number. - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); - - // Polynomial coefficients. 
- _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f); - - // invalid_mask is set to true when x is NaN - __mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); - __mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, p16f_min_norm_pos); - - // Extract the shifted exponents. - Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23)); - Packet16f e = _mm512_sub_ps(emm0, p16f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm512_and_ps(x, p16f_inv_mant_mask); - x = _mm512_or_ps(x, p16f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. 
- // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); - Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x); - x = psub(x, p16f_1); - e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1)); - x = padd(x, tmp); - - Packet16f x2 = pmul(x, x); - Packet16f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet16f y, y1, y2; - y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1); - y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4); - y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7); - y = pmadd(y, x, p16f_cephes_log_p2); - y1 = pmadd(y1, x, p16f_cephes_log_p5); - y2 = pmadd(y2, x, p16f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p16f_cephes_log_q1); - tmp = pmul(x2, p16f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p16f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ); - // Filter out invalid inputs, i.e.: - // - negative arg will be NAN, - // - 0 will be -INF. 
- // - +INF will be +INF - return _mm512_mask_blend_ps(iszero_mask, - _mm512_mask_blend_ps(invalid_mask, - _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf), - p16f_nan), - p16f_minus_inf); + return plog_float(_x); } -#endif +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +plog(const Packet8d& _x) { + return plog_double(_x); +} + +F16_PACKET_FUNCTION(Packet16f, Packet16h, plog) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog) + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +plog2(const Packet16f& _x) { + return plog2_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +plog2(const Packet8d& _x) { + return plog2_double(_x); +} + +F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2) // Exponential function. Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then @@ -166,17 +99,17 @@ pexp(const Packet16f& _x) { _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f); Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x); Packet16f r2 = pmul(r, r); + Packet16f r3 = pmul(r2, r); - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. - Packet16f y = p16f_cephes_exp_p0; - y = pmadd(y, r, p16f_cephes_exp_p1); - y = pmadd(y, r, p16f_cephes_exp_p2); - y = pmadd(y, r, p16f_cephes_exp_p3); - y = pmadd(y, r, p16f_cephes_exp_p4); - y = pmadd(y, r, p16f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p16f_1); + // Evaluate the polynomial approximant,improved by instruction-level parallelism. 
+ Packet16f y, y1, y2; + y = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1); + y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4); + y2 = padd(r, p16f_1); + y = pmadd(y, r, p16f_cephes_exp_p2); + y1 = pmadd(y1, r, p16f_cephes_exp_p5); + y = pmadd(y, r3, y1); + y = pmadd(y, r2, y2); // Build emm0 = 2^m. Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127)); @@ -186,74 +119,40 @@ pexp(const Packet16f& _x) { return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); } -/*template <> +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d pexp(const Packet8d& _x) { - Packet8d x = _x; - - _EIGEN_DECLARE_CONST_Packet8d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet8d(2, 2.0); - - _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6); - - // clamp x - x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo); - - // Express exp(x) as exp(g + n*log(2)). - const Packet8d n = - _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. 
- const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1); - const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2); - x = psub(x, nC1); - x = psub(x, nC2); - - const Packet8d x2 = pmul(x, x); + return pexp_double(_x); +} - // Evaluate the numerator polynomial of the rational interpolant. - Packet8d px = p8d_cephes_exp_p0; - px = pmadd(px, x2, p8d_cephes_exp_p1); - px = pmadd(px, x2, p8d_cephes_exp_p2); - px = pmul(px, x); +F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp) - // Evaluate the denominator polynomial of the rational interpolant. - Packet8d qx = p8d_cephes_exp_q0; - qx = pmadd(qx, x2, p8d_cephes_exp_q1); - qx = pmadd(qx, x2, p8d_cephes_exp_q2); - qx = pmadd(qx, x2, p8d_cephes_exp_q3); +template <> +EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) { + Packet16f fexponent; + const Packet16h out = float2half(pfrexp(half2float(a), fexponent)); + exponent = float2half(fexponent); + return out; +} - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm512_div_pd(px, psub(qx, px)); - x = pmadd(p8d_2, x, p8d_1); +template <> +EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) { + return float2half(pldexp(half2float(a), half2float(exponent))); +} - // Build e=2^n. - const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64( - _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52)); +template <> +EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) { + Packet16f fexponent; + const Packet16bf out = F32ToBf16(pfrexp(Bf16ToF32(a), fexponent)); + exponent = F32ToBf16(fexponent); + return out; +} - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. 
- return pmax(pmul(x, e), _x); - }*/ +template <> +EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) { + return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent))); +} // Functions for sqrt. // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step @@ -304,83 +203,157 @@ template <> EIGEN_STRONG_INLINE Packet16f psqrt(const Packet16f& x) { return _mm512_sqrt_ps(x); } + template <> EIGEN_STRONG_INLINE Packet8d psqrt(const Packet8d& x) { return _mm512_sqrt_pd(x); } #endif -// Functions for rsqrt. -// Almost identical to the sqrt routine, just leave out the last multiplication -// and fill in NaN/Inf where needed. Note that this function only exists as an -// iterative version for doubles since there is no instruction for diretly -// computing the reciprocal square root in AVX-512. -#ifdef EIGEN_FAST_MATH +F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt) + +// prsqrt for float. +#if defined(EIGEN_VECTORIZE_AVX512ER) + +template <> +EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { + return _mm512_rsqrt28_ps(x); +} +#elif EIGEN_FAST_MATH + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f prsqrt(const Packet16f& _x) { _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); Packet16f neg_half = pmul(_x, p16f_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); - Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps()); - - // Fill in NaNs and Infs for the negative/zero entries. 
- __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); - Packet16f infs_and_nans = _mm512_mask_blend_ps( - neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + // Identity infinite, negative and denormal arguments. + __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ); + __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ); + __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask; + + // Compute an approximate result using the rsqrt intrinsic, forcing +inf + // for denormals for consistency with AVX and SSE implementations. + Packet16f y_approx = _mm512_rsqrt14_ps(_x); + + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five)); + + // Select the result of the Newton-Raphson step for positive finite arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. + return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx); +} +#else - // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans); +template <> +EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { + _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f); + return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x)); } +#endif + +F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt) +// prsqrt for double. 
+#if EIGEN_FAST_MATH template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d prsqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL); _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); Packet8d neg_half = pmul(_x, p8d_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); - Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd()); + // Identity infinite, negative and denormal arguments. + __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ); + __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ); + __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask; + + // Compute an approximate result using the rsqrt intrinsic, forcing +inf + // for denormals for consistency with AVX and SSE implementations. +#if defined(EIGEN_VECTORIZE_AVX512ER) + Packet8d y_approx = _mm512_rsqrt28_pd(_x); +#else + Packet8d y_approx = _mm512_rsqrt14_pd(_x); +#endif + // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the + // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available). + // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number + // of correct digits for each step. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. 
+ Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five)); +#if !defined(EIGEN_VECTORIZE_AVX512ER) + y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five)); +#endif + // Select the result of the Newton-Raphson step for positive finite arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. + return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx); +} +#else +template <> +EIGEN_STRONG_INLINE Packet8d prsqrt(const Packet8d& x) { + _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f); + return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x)); +} +#endif + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16f plog1p(const Packet16f& _x) { + return generic_plog1p(_x); +} + +F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p) - // Fill in NaNs and Infs for the negative/zero entries. - __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); - Packet8d infs_and_nans = _mm512_mask_blend_pd( - neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16f pexpm1(const Packet16f& _x) { + return generic_expm1(_x); +} - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); +F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1) - // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); +#endif + + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +psin(const Packet16f& _x) { + return psin_float(_x); +} - // Insert NaNs and Infs in all the right places. 
- return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +pcos(const Packet16f& _x) { + return pcos_float(_x); } -#elif defined(EIGEN_VECTORIZE_AVX512ER) + template <> -EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { - return _mm512_rsqrt28_ps(x); +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +ptanh(const Packet16f& _x) { + return internal::generic_fast_tanh_float(_x); } -#endif -#endif + +F16_PACKET_FUNCTION(Packet16f, Packet16h, psin) +F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos) +F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh) + +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh) } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/AVX512/PacketMath.h b/inst/include/Eigen/src/Core/arch/AVX512/PacketMath.h index 000b7762..34d49ab6 100644 --- a/inst/include/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/inst/include/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -31,6 +31,8 @@ namespace internal { typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; +typedef eigen_packet_wrapper<__m256i, 1> Packet16h; +typedef eigen_packet_wrapper<__m256i, 2> Packet16bf; template <> struct is_arithmetic<__m512> { @@ -45,6 +47,51 @@ struct is_arithmetic<__m512d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. 
+ typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasBessel = 1, + HasNdtri = 1 + }; +}; + template<> struct packet_traits : default_packet_traits { typedef Packet16f type; @@ -54,16 +101,32 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 16, HasHalfPacket = 1, + + HasAbs = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, HasBlend = 0, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) -#ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, -#endif + HasLog1p = 1, + HasExpm1 = 1, + HasNdtri = 1, + HasBessel = 1, HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, #endif - HasDiv = 1 + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -76,10 +139,17 @@ template<> struct packet_traits : default_packet_traits size = 8, HasHalfPacket = 1, #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) + HasLog = 1, + HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif - HasDiv = 1 + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1 }; }; @@ -100,19 +170,27 @@ struct unpacket_traits { typedef float type; typedef Packet8f half; typedef Packet16i integer_packet; - enum { size = 16, alignment=Aligned64 }; + typedef uint16_t mask_t; + enum { size 
= 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true }; }; template <> struct unpacket_traits { typedef double type; typedef Packet4d half; - enum { size = 8, alignment=Aligned64 }; + enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false }; }; template <> struct unpacket_traits { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false }; +}; + +template<> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet8h half; + enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; template <> @@ -128,6 +206,33 @@ EIGEN_STRONG_INLINE Packet16i pset1(const int& from) { return _mm512_set1_epi32(from); } +template <> +EIGEN_STRONG_INLINE Packet16f pset1frombits(unsigned int from) { + return _mm512_castsi512_ps(_mm512_set1_epi32(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pset1frombits(const numext::uint64_t from) { + return _mm512_castsi512_pd(_mm512_set1_epi64(from)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); } + +template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) { + return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1)); +} +template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) { + return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1); +} +template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) { + return 
_mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, + 0, 0, -1, -1, 0, 0, -1, -1)); +} + template <> EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { return _mm512_broadcastss_ps(_mm_load_ps1(from)); @@ -217,7 +322,7 @@ EIGEN_STRONG_INLINE Packet8d pmul(const Packet8d& a, template <> EIGEN_STRONG_INLINE Packet16i pmul(const Packet16i& a, const Packet16i& b) { - return _mm512_mul_epi32(a, b); + return _mm512_mullo_epi32(a, b); } template <> @@ -244,6 +349,24 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, } #endif +template <> +EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask, + const Packet16f& a, + const Packet16f& b) { + __mmask16 mask16 = _mm512_cmp_epi32_mask( + _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ); + return _mm512_mask_blend_ps(mask16, a, b); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, + const Packet8d& a, + const Packet8d& b) { + __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), + _mm512_setzero_epi32(), _MM_CMPINT_EQ); + return _mm512_mask_blend_pd(mask8, a, b); +} + template <> EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { @@ -270,6 +393,41 @@ EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, return _mm512_max_pd(b, a); } +// Add specializations for min/max with prescribed NaN progation. 
+template<> +EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, const Packet8d& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, const Packet8d& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, const Packet8d& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) { + return pminmax_propagate_nan(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, const Packet8d& b) { + return pminmax_propagate_nan(a, b, pmax); +} + + #ifdef EIGEN_VECTORIZE_AVX512DQ template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); } template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); } @@ -310,6 +468,85 @@ EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) { return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); } +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} +template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); 
+} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + +template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); +} + + +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template<> EIGEN_STRONG_INLINE Packet16f print(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet8d print(const Packet8d& a) { return 
_mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); } + +template<> EIGEN_STRONG_INLINE Packet16f pceil(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); } +template<> EIGEN_STRONG_INLINE Packet8d pceil(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); } + +template<> EIGEN_STRONG_INLINE Packet16f pfloor(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); } +template<> EIGEN_STRONG_INLINE Packet8d pfloor(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); } + +template <> +EIGEN_STRONG_INLINE Packet16i ptrue(const Packet16i& /*a*/) { + return _mm512_set1_epi32(0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) { + return _mm512_castsi512_ps(ptrue(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d ptrue(const Packet8d& a) { + return _mm512_castsi512_pd(ptrue(_mm512_castpd_si512(a))); +} + template <> EIGEN_STRONG_INLINE Packet16i pand(const Packet16i& a, const Packet16i& b) { @@ -411,6 +648,21 @@ EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a,const Packet8d& #endif } +template<> EIGEN_STRONG_INLINE Packet16f pround(const Packet16f& a) +{ + // Work-around for default std::round rounding mode. + const Packet16f mask = pset1frombits(static_cast(0x80000000u)); + const Packet16f prev0dot5 = pset1frombits(static_cast(0x3EFFFFFFu)); + return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} +template<> EIGEN_STRONG_INLINE Packet8d pround(const Packet8d& a) +{ + // Work-around for default std::round rounding mode. 
+ const Packet8d mask = pset1frombits(static_cast(0x8000000000000000ull)); + const Packet8d prev0dot5 = pset1frombits(static_cast(0x3FDFFFFFFFFFFFFFull)); + return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + template EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) { return _mm512_srai_epi32(a, N); } @@ -451,6 +703,12 @@ EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { reinterpret_cast(from)); } +template <> +EIGEN_STRONG_INLINE Packet16f ploadu(const float* from, uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from); +} + // Loads 8 floats from memory a returns the packet // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} template <> @@ -535,6 +793,11 @@ EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( reinterpret_cast<__m512i*>(to), from); } +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet16f& from, uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from); +} template <> EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, @@ -631,6 +894,59 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { _mm512_set1_epi64(0x7fffffffffffffff))); } +template<> +EIGEN_STRONG_INLINE Packet16f pfrexp(const Packet16f& a, Packet16f& exponent){ + return pfrexp_generic(a, exponent); +} + +// Extract exponent without existence of Packet8l. 
+template<> +EIGEN_STRONG_INLINE +Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) { + const Packet8d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); + #ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)); + #else + return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52))); + #endif +} + +template<> +EIGEN_STRONG_INLINE Packet8d pfrexp(const Packet8d& a, Packet8d& exponent) { + return pfrexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet16f pldexp(const Packet16f& a, const Packet16f& exponent) { + return pldexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, const Packet8d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet8d max_exponent = pset1(2099.0); + const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + + // Split 2^e into four factors and multiply. 
+ const Packet8i bias = pset1(1023); + Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4) + + // 2^b + const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); + Packet8i lo = _mm256_slli_epi64(hi, 52); + hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); + Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); + Packet8d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + + // 2^(e - 3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); + lo = _mm256_slli_epi64(hi, 52); + hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); + c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); + out = pmul(out, c); // a * 2^e + return out; +} + #ifdef EIGEN_VECTORIZE_AVX512DQ // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ @@ -686,27 +1002,26 @@ EIGEN_STRONG_INLINE double predux(const Packet8d& a) { } template <> -EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { +EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - return padd(lane0, lane1); + __m256 lane0 = _mm512_extractf32x8_ps(a, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a, 1); + return _mm256_add_ps(lane0, lane1); #else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f sum0 = padd(lane0, lane2); - Packet4f sum1 = padd(lane1, lane3); + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = 
_mm512_extractf32x4_ps(a, 3); + __m128 sum0 = _mm_add_ps(lane0, lane2); + __m128 sum1 = _mm_add_ps(lane1, lane3); return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); #endif } template <> -EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = padd(lane0, lane1); - return res; +EIGEN_STRONG_INLINE Packet4d predux_half_dowto4(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + return _mm256_add_pd(lane0, lane1); } template <> @@ -777,196 +1092,13 @@ EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); } -template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* vecs) +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) { - EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); - EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); - EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2); - EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3); - EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4); - EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5); - EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6); - EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7); - EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8); - EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9); - EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10); - EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11); - EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12); - EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13); - EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14); - EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15); - - __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0); - __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0); - __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0); - __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0); - - __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); - __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); - __m256 hsum7 = 
_mm256_hadd_ps(hsum3, hsum3); - __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - __m256 sum1 = _mm256_add_ps(perm1, hsum5); - __m256 sum2 = _mm256_add_ps(perm2, hsum6); - __m256 sum3 = _mm256_add_ps(perm3, hsum7); - __m256 sum4 = _mm256_add_ps(perm4, hsum8); - - __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); - - hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1); - hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1); - hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1); - hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0)); - - hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0); - hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0); - hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0); - hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - 
perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0); - - hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1); - hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1); - hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1); - hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0)); - - __m512 final_output; - - EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1); - return final_output; + Packet16i xi = _mm512_castps_si512(x); + __mmask16 tmp = _mm512_test_epi32_mask(xi,xi); + return !_mm512_kortestz(tmp,tmp); } -template<> EIGEN_STRONG_INLINE Packet8d preduxp(const Packet8d* vecs) -{ - Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0); - Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1); - - Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0); - Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1); - - Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0); - Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1); - - Packet4d vecs3_0 = 
_mm512_extractf64x4_pd(vecs[3], 0); - Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1); - - Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0); - Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1); - - Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0); - Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1); - - Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0); - Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1); - - Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0); - Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1); - - Packet4d tmp0, tmp1; - - tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC); - - tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC)); - - tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC); - - tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC)); - - __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0); - - return _mm512_insertf64x4(final_output, final_1, 1); -} - #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ @@ -1242,61 +1374,927 @@ EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& 
ifPacket, return _mm512_mask_blend_pd(m, elsePacket, thenPacket); } -template<> EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { - return _mm512_cvttps_epi32(a); +// Packet math for Eigen::half +template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { + return _mm256_set1_epi16(from.x); +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { + return _mm256_load_si256(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { + return _mm256_loadu_si256(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_store_si256((__m256i*)(void*)to, from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_storeu_si256((__m256i*)(void*)to, from); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploaddup(const Eigen::half* from) { + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + unsigned short e = from[4].x; + unsigned short f = from[5].x; + unsigned short g = from[6].x; + unsigned short h = from[7].x; + return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); +} + 
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); + + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + return _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { + return ptrue(Packet8i(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) { + const __m256i sign_mask = _mm256_set1_epi16(static_cast(0x8000)); + return _mm256_andnot_si256(sign_mask, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmin(const Packet16h& a, + const Packet16h& b) { + return float2half(pmin(half2float(a), half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmax(const Packet16h& a, + const Packet16h& b) { + return float2half(pmax(half2float(a), half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16h plset(const half& a) { + return float2half(plset(static_cast(a))); +} + 
+template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { + // in some cases Packet8i is a wrapper around __m256i, so we need to + // cast to Packet8i to call the correct overload. + return por(Packet8i(a),Packet8i(b)); +} +template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { + return pxor(Packet8i(a),Packet8i(b)); +} +template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { + return pand(Packet8i(a),Packet8i(b)); +} +template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { + return pandnot(Packet8i(a),Packet8i(b)); } -template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { - return _mm512_cvtepi32_ps(a); +template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { + return _mm256_blendv_epi8(b, a, mask); } -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet16f& first, - const Packet16f& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, - Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, - Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); +template<> EIGEN_STRONG_INLINE Packet16h pround(const Packet16h& a) { + return float2half(pround(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16h print(const Packet16h& a) { + return float2half(print(half2float(a))); +} - __m512i second_idx = - _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, - Offset - 5, Offset - 6, Offset - 7, Offset - 8, - Offset - 9, Offset - 10, Offset - 11, Offset - 12, - Offset - 13, Offset - 14, Offset - 15, Offset - 16); +template<> EIGEN_STRONG_INLINE Packet16h pceil(const Packet16h& a) { + return float2half(pceil(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pfloor(const Packet16h& a) { + return 
float2half(pfloor(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + return Pack32To16(pcmp_eq(af, bf)); +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) { + return Pack32To16(pcmp_le(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) { + return Pack32To16(pcmp_lt(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) { + return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b))); +} - unsigned short mask = 0xFFFF; - mask <<= (16 - Offset); +template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; } - first = _mm512_permutexvar_ps(first_idx, first); - Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); - first = _mm512_mask_blend_ps(mask, first, tmp); +template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { + Packet16h sign_mask = _mm256_set1_epi16(static_cast(0x8000)); + return _mm256_xor_si256(a, sign_mask); +} + +template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + 
Packet16f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h predux_half_dowto4(const Packet16h& a) { + Packet8h lane0 = _mm256_extractf128_si256(a, 0); + Packet8h lane1 = _mm256_extractf128_si256(a, 1); + return padd(lane0, lane1); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet16h& a) { + Packet16f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet16h& a) { + Packet16f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux_mul(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + return _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)), + _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1); +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) +{ + return _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); +} + +template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + 
to[stride*6] = aux[6]; + to[stride*7] = aux[7]; + to[stride*8] = aux[8]; + to[stride*9] = aux[9]; + to[stride*10] = aux[10]; + to[stride*11] = aux[11]; + to[stride*12] = aux[12]; + to[stride*13] = aux[13]; + to[stride*14] = aux[14]; + to[stride*15] = aux[15]; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0]; + __m256i b = kernel.packet[1]; + __m256i c = kernel.packet[2]; + __m256i d = kernel.packet[3]; + __m256i e = kernel.packet[4]; + __m256i f = kernel.packet[5]; + __m256i g = kernel.packet[6]; + __m256i h = kernel.packet[7]; + __m256i i = kernel.packet[8]; + __m256i j = kernel.packet[9]; + __m256i k = kernel.packet[10]; + __m256i l = kernel.packet[11]; + __m256i m = kernel.packet[12]; + __m256i n = kernel.packet[13]; + __m256i o = kernel.packet[14]; + __m256i p = kernel.packet[15]; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, 
op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. 
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0] = a_p_0; + kernel.packet[1] = a_p_1; + kernel.packet[2] = a_p_2; + kernel.packet[3] = a_p_3; + kernel.packet[4] = a_p_4; + kernel.packet[5] = a_p_5; + kernel.packet[6] = a_p_6; + kernel.packet[7] = a_p_7; + kernel.packet[8] = a_p_8; + kernel.packet[9] = a_p_9; + kernel.packet[10] = a_p_a; + kernel.packet[11] = a_p_b; + kernel.packet[12] = a_p_c; + kernel.packet[13] = a_p_d; + kernel.packet[14] = a_p_e; + kernel.packet[15] = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + pstore(in[4], kernel.packet[4]); + pstore(in[5], kernel.packet[5]); + 
pstore(in[6], kernel.packet[6]); + pstore(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; } } -}; -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, - Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); - - __m512i second_idx = _mm512_set_epi32( - 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, - Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); - - unsigned char mask = 0xFF; - mask <<= (8 - Offset); - - first = _mm512_permutexvar_pd(first_idx, first); - Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); - first = _mm512_mask_blend_pd(mask, first, tmp); + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); + kernel.packet[4] = pload(out[4]); + kernel.packet[5] = pload(out[5]); + kernel.packet[6] = pload(out[6]); + kernel.packet[7] = pload(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; } } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + +template <> struct is_arithmetic { 
enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16bf type; + typedef Packet8bf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + HasBlend = 0, + HasInsert = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#ifdef EIGEN_VECTORIZE_AVX512DQ + HasLog = 1, // Currently fails test with bad accuracy. + HasLog1p = 1, + HasExpm1 = 1, + HasNdtri = 1, + HasBessel = 1, +#endif + HasExp = 1, + HasSqrt = EIGEN_FAST_MATH, + HasRsqrt = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, +#endif + HasCmp = 1, + HasDiv = 1 + }; +}; + +template <> +struct unpacket_traits +{ + typedef bfloat16 type; + enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet8bf half; }; +template <> +EIGEN_STRONG_INLINE Packet16bf pset1(const bfloat16& from) { + return _mm256_set1_epi16(from.value); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet16bf& from) { + bfloat16 t; + t.value = static_cast(_mm256_extract_epi16(from, 0)); + return t; +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pload(const bfloat16* from) { + return _mm256_load_si256(reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf ploadu(const bfloat16* from) { + return _mm256_loadu_si256(reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE void pstore(bfloat16* to, + const Packet16bf& from) { + _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, + const Packet16bf& from) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet16bf +ploaddup(const bfloat16* from) { + Packet16bf r; + unsigned short a = from[0].value; + unsigned short b = from[1].value; + unsigned short c = from[2].value; + unsigned 
short d = from[3].value; + unsigned short e = from[4].value; + unsigned short f = from[5].value; + unsigned short g = from[6].value; + unsigned short h = from[7].value; + return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet16bf +ploadquad(const bfloat16* from) { + Packet16bf r; + unsigned short a = from[0].value; + unsigned short b = from[1].value; + unsigned short c = from[2].value; + unsigned short d = from[3].value; + return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); +} + +EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) { + return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16)); +} + +// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm. +EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { + Packet16bf r; + +#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1) + // Since GCC 10.1 supports avx512bf16 and C style explicit cast + // (C++ static_cast is not supported yet), do converion via intrinsic + // and register path for performance. 
+ r = (__m256i)(_mm512_cvtneps_pbh(a)); + +#else + __m512i t; + __m512i input = _mm512_castps_si512(a); + __m512i nan = _mm512_set1_epi32(0x7fc0); + + // uint32_t lsb = (input >> 16) & 1; + t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1)); + // uint32_t rounding_bias = 0x7fff + lsb; + t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff)); + // input += rounding_bias; + t = _mm512_add_epi32(t, input); + // input = input >> 16; + t = _mm512_srli_epi32(t, 16); + + // Check NaN before converting back to bf16 + __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); + + t = _mm512_mask_blend_epi32(mask, nan, t); + // output.value = static_cast(input); + r = _mm512_cvtepi32_epi16(t); +#endif // EIGEN_VECTORIZE_AVX512BF16 + + return r; +} + +template <> +EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) { + return ptrue(a); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) { + return por(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) { + return pxor(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) { + return pand(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a, + const Packet16bf& b) { + return pandnot(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask, + const Packet16bf& a, + const Packet16bf& b) { + // Input mask is expected to be all 0/1, handle it with 8-bit + // intrinsic for performance. 
+ return _mm256_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet16bf pround(const Packet16bf& a) +{ + return F32ToBf16(pround(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16bf print(const Packet16bf& a) { + return F32ToBf16(print(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16bf pceil(const Packet16bf& a) { + return F32ToBf16(pceil(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16bf pfloor(const Packet16bf& a) { + return F32ToBf16(pfloor(Bf16ToF32(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a, + const Packet16bf& b) { + return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a, + const Packet16bf& b) { + return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a, + const Packet16bf& b) { + return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a, + const Packet16bf& b) { + return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pnegate(const Packet16bf& a) { + Packet16bf sign_mask = _mm256_set1_epi16(static_cast(0x8000)); + return _mm256_xor_si256(a, sign_mask); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pconj(const Packet16bf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) { + const __m256i sign_mask = _mm256_set1_epi16(static_cast(0x8000)); + return _mm256_andnot_si256(sign_mask, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf padd(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf psub(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(psub(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE 
Packet16bf pmul(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pdiv(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(pdiv(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pmin(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pmax(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf plset(const bfloat16& a) { + return F32ToBf16(plset(static_cast(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4(const Packet16bf& a) { + Packet8bf lane0 = _mm256_extractf128_si256(a, 0); + Packet8bf lane1 = _mm256_extractf128_si256(a, 1); + return padd(lane0, lane1); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux(const Packet16bf& p) { + return static_cast(predux(Bf16ToF32(p))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet16bf& from) { + return static_cast(predux_mul(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) { + return static_cast(predux_min(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) { + return static_cast(predux_max(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) { + __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1, + 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + + Packet16bf res; + // Swap hi and lo first because shuffle is in 128-bit lanes. + res = _mm256_permute2x128_si256(a, a, 1); + // Shuffle 8-bit values in src within 2*128-bit lanes. 
+ return _mm256_shuffle_epi8(res, m); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pgather(const bfloat16* from, + Index stride) { + return _mm256_set_epi16( + from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value, + from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value, + from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value, + from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value); +} + +template <> +EIGEN_STRONG_INLINE void pscatter(bfloat16* to, + const Packet16bf& from, + Index stride) { + EIGEN_ALIGN64 bfloat16 aux[16]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; + to[stride*8] = aux[8]; + to[stride*9] = aux[9]; + to[stride*10] = aux[10]; + to[stride*11] = aux[11]; + to[stride*12] = aux[12]; + to[stride*13] = aux[13]; + to[stride*14] = aux[14]; + to[stride*15] = aux[15]; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0]; + __m256i b = kernel.packet[1]; + __m256i c = kernel.packet[2]; + __m256i d = kernel.packet[3]; + __m256i e = kernel.packet[4]; + __m256i f = kernel.packet[5]; + __m256i g = kernel.packet[6]; + __m256i h = kernel.packet[7]; + __m256i i = kernel.packet[8]; + __m256i j = kernel.packet[9]; + __m256i k = kernel.packet[10]; + __m256i l = kernel.packet[11]; + __m256i m = kernel.packet[12]; + __m256i n = kernel.packet[13]; + __m256i o = kernel.packet[14]; + __m256i p = kernel.packet[15]; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = 
_mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = 
_mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0]; + __m256i b = kernel.packet[1]; + __m256i c = kernel.packet[2]; + __m256i d = kernel.packet[3]; + + __m256i ab_07 
= _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20); + kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20); + kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31); + kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31); +} } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/AVX512/TypeCasting.h b/inst/include/Eigen/src/Core/arch/AVX512/TypeCasting.h new file mode 100644 index 00000000..33041272 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -0,0 +1,89 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_AVX512_H +#define EIGEN_TYPE_CASTING_AVX512_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { + return _mm512_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { + return _mm512_cvtepi32_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet16i preinterpret(const Packet16f& a) { + return _mm512_castps_si512(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet16i& a) { + return _mm512_castsi512_ps(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { + return float2half(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16bf& a) { + return Bf16ToF32(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16bf pcast(const Packet16f& a) { + return F32ToBf16(a); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_AVX512_H diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/Complex.h b/inst/include/Eigen/src/Core/arch/AltiVec/Complex.h index 3e665730..f424f11c 100644 --- a/inst/include/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/inst/include/Eigen/src/Core/arch/AltiVec/Complex.h @@ -29,8 +29,54 @@ static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (P //---------- float ---------- struct Packet2cf { - EIGEN_STRONG_INLINE 
explicit Packet2cf() : v(p4f_ZERO) {} + EIGEN_STRONG_INLINE explicit Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) + { + Packet4f v1, v2; + + // Permute and multiply the real parts of a and b + v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + // Get the imaginary parts of a + v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + // multiply a_re * b + v1 = vec_madd(v1, b.v, p4f_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(v2, b.v, p4f_ZERO); + v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); + // permute back to a proper order + v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); + + return Packet2cf(padd(v1, v2)); + } + + EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) { + v = pmul(Packet2cf(*this), b).v; + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { + return Packet2cf(*this) *= b; + } + + EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { + return Packet2cf(*this) += b; + } + EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { + return Packet2cf(*this) -= b; + } + EIGEN_STRONG_INLINE Packet2cf operator-(void) const { + return Packet2cf(-v); + } + Packet4f v; }; @@ -38,6 +84,7 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; typedef Packet2cf half; + typedef Packet4f as_real; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -60,7 +107,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex 
type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -80,16 +127,35 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } +EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex* from0, const std::complex* from1) +{ + Packet4f res0, res1; +#ifdef __VSX__ + __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1)); +#ifdef _BIG_ENDIAN + __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); +#else + __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); +#endif +#else + *reinterpret_cast *>(&res0) = *from0; + *reinterpret_cast *>(&res1) = *from1; + res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI); +#endif + return Packet2cf(res0); +} + template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -100,25 +166,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { 
return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet4f v1, v2; - - // Permute and multiply the real parts of a and b - v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); - // Get the imaginary parts of a - v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); - // multiply a_re * b - v1 = vec_madd(v1, b.v, p4f_ZERO); - // multiply a_im * b and get the conjugate result - v2 = vec_madd(v2, b.v, p4f_ZERO); - v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); - // permute back to a proper order - v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); - - return Packet2cf(padd(v1, v2)); -} - template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v, b.v)); } @@ -128,7 +175,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::co template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore((float *)&res, a.v); return res[0]; @@ -149,22 +196,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe return pfirst(Packet2cf(b)); } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - Packet4f b1, b2; -#ifdef _BIG_ENDIAN - b1 = vec_sld(vecs[0].v, vecs[1].v, 8); - b2 = vec_sld(vecs[1].v, vecs[0].v, 8); -#else - b1 = vec_sld(vecs[1].v, vecs[0].v, 8); - b2 = vec_sld(vecs[0].v, vecs[1].v, 8); -#endif - b2 = vec_sld(b2, b2, 8); - b2 = padd(b1, b2); - - return Packet2cf(b2); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { Packet4f b; @@ -175,61 +206,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P 
return pfirst(prod); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { -#ifdef _BIG_ENDIAN - first.v = vec_sld(first.v, second.v, 8); -#else - first.v = vec_sld(second.v, first.v, 8); -#endif - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); + Packet2cf res = pmul(a, pconj(b)); Packet4f s = pmul(b.v, b.v); return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } @@ -246,6 +228,11 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[0].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { + Packet4f eq = reinterpret_cast(vec_cmpeq(a.v,b.v)); + return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV))); +} + #ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& 
ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { Packet2cf result; @@ -254,12 +241,62 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con } #endif +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) +{ + return psqrt_complex(a); +} + //---------- double ---------- #ifdef __VSX__ struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) + { + Packet2d a_re, a_im, v1, v2; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + // multiply a_re * b + v1 = vec_madd(a_re, b.v, p2d_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(a_im, b.v, p2d_ZERO); + v2 = reinterpret_cast(vec_sld(reinterpret_cast(v2), reinterpret_cast(v2), 8)); + v2 = pxor(v2, reinterpret_cast(p2ul_CONJ_XOR1)); + + return Packet1cd(padd(v1, v2)); + } + + EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) { + v = pmul(Packet1cd(*this), b).v; + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { + return Packet1cd(*this) *= b; + } + + EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { + return Packet1cd(*this) += b; + } + EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { + return Packet1cd(*this) -= b; + } + EIGEN_STRONG_INLINE Packet1cd operator-(void) const { + return Packet1cd(-v); + } + Packet2d v; }; @@ -267,6 +304,7 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; + typedef Packet2d as_real; 
enum { Vectorizable = 1, AlignedOnScalar = 0, @@ -286,7 +324,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; }; template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } @@ -296,19 +334,13 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex< template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index) { - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); + return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index) { - std::complex EIGEN_ALIGN16 af[2]; - pstore >(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; + pstore >(to, from); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } @@ -316,24 +348,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, con template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } 
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast(p2ul_CONJ_XOR2))); } -template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) -{ - Packet2d a_re, a_im, v1, v2; - - // Permute and multiply the real parts of a and b - a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); - // Get the imaginary parts of a - a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); - // multiply a_re * b - v1 = vec_madd(a_re, b.v, p2d_ZERO); - // multiply a_im * b and get the conjugate result - v2 = vec_madd(a_im, b.v, p2d_ZERO); - v2 = reinterpret_cast(vec_sld(reinterpret_cast(v2), reinterpret_cast(v2), 8)); - v2 = pxor(v2, reinterpret_cast(p2ul_CONJ_XOR1)); - - return Packet1cd(padd(v1, v2)); -} - template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); } @@ -345,7 +359,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore >(res, a); return res[0]; @@ -354,59 +368,15 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never 
have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); } @@ -422,6 +392,23 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); kernel.packet[0].v = tmp; } + +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a)==re(b), im(a)==im(b)] + Packet2d eq = reinterpret_cast(vec_cmpeq(a.v,b.v)); + // Swap real/imag elements in the mask in to get: + // [im(a)==im(b), re(a)==re(b)] + Packet2d eq_swapped = 
reinterpret_cast(vec_sld(reinterpret_cast(eq), reinterpret_cast(eq), 8)); + // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cd(vec_and(eq, eq_swapped)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) +{ + return psqrt_complex(a); +} + #endif // __VSX__ } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/inst/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h index c5e4bede..3a7a3293 100644 --- a/inst/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/inst/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -9,10 +9,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H @@ -20,180 +16,28 @@ namespace Eigen { namespace internal { -static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); -static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); -static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); -static _EIGEN_DECLARE_CONST_Packet4i(23, 23); - -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - -/* the smallest non denormalized float number */ -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); - -/* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 -*/ -static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); -static 
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - -static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); -static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - -static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - -#ifdef __VSX__ -static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); -static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); -static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - -static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); -static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); -static 
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - -#ifdef __POWER8_VECTOR__ -static Packet2l p2l_1023 = { 1023, 1023 }; -static Packet2ul p2ul_52 = { 52, 52 }; -#endif - -#endif - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& _x) { - Packet4f x = _x; - - Packet4i emm0; - - /* isvalid_mask is 0 if x < 0 or x is NaN. */ - Packet4ui isvalid_mask = reinterpret_cast(vec_cmpge(x, p4f_ZERO)); - Packet4ui iszero_mask = reinterpret_cast(vec_cmpeq(x, p4f_ZERO)); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = vec_sr(reinterpret_cast(x), - reinterpret_cast(p4i_23)); - - /* keep only the fractional part */ - x = pand(x, p4f_inv_mant_mask); - x = por(x, p4f_half); - - emm0 = psub(emm0, p4i_0x7f); - Packet4f e = padd(vec_ctf(emm0, 0), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = reinterpret_cast(vec_cmplt(x, p4f_cephes_SQRTHF)); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = 
pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - x = vec_sel(x, p4f_minus_inf, iszero_mask); - x = vec_sel(p4f_minus_nan, x, isvalid_mask); - return x; + return plog_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { - Packet4f x = _x; - - Packet4f tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); - - // express exp(x) as exp(g + n*log(2)) - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); - - fx = pfloor(fx); - - tmp = pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); + return pexp_float(_x); +} - // build 2^n - emm0 = vec_cts(fx, 0); - emm0 = vec_add(emm0, p4i_0x7f); - emm0 = vec_sl(emm0, reinterpret_cast(p4i_23)); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psin(const Packet4f& _x) +{ + return psin_float(_x); +} - // Altivec's max & min operators just drop silent NaNs. Check NaNs in - // inputs and return them unmodified. 
- Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); - return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), - isnumber_mask); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pcos(const Packet4f& _x) +{ + return pcos_float(_x); } #ifndef EIGEN_COMP_CLANG @@ -225,95 +69,19 @@ Packet2d psqrt(const Packet2d& x) return vec_sqrt(x); } -// VSX support varies between different compilers and even different -// versions of the same compiler. For gcc version >= 4.9.3, we can use -// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use -// a slow version that works with older compilers. -// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles -// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 -static inline Packet2l ConvertToPacket2l(const Packet2d& x) { -#if EIGEN_GNUC_AT_LEAST(5, 4) || \ - (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) - return vec_cts(x, 0); // TODO: check clang version. -#else - double tmp[2]; - memcpy(tmp, &x, sizeof(tmp)); - Packet2l l = { static_cast(tmp[0]), - static_cast(tmp[1]) }; - return l; -#endif -} - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) { - Packet2d x = _x; - - Packet2d tmp, fx; - Packet2l emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half); - - fx = pfloor(fx); - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // 
build 2^n - emm0 = ConvertToPacket2l(fx); - -#ifdef __POWER8_VECTOR__ - emm0 = vec_add(emm0, p2l_1023); - emm0 = vec_sl(emm0, p2ul_52); -#else - // Code is a bit complex for POWER7. There is actually a - // vec_xxsldi intrinsic but it is not supported by some gcc versions. - // So we shift (52-32) bits and do a word swap with zeros. - _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); - _EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32 - - Packet4i emm04i = reinterpret_cast(emm0); - emm04i = vec_add(emm04i, p4i_1023); - emm04i = vec_sl(emm04i, reinterpret_cast(p4i_20)); - static const Packet16uc perm = { - 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, - 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; -#ifdef _BIG_ENDIAN - emm0 = reinterpret_cast(vec_perm(p4i_ZERO, emm04i, perm)); -#else - emm0 = reinterpret_cast(vec_perm(emm04i, p4i_ZERO, perm)); -#endif - + return pexp_double(_x); +} #endif - // Altivec's max & min operators just drop silent NaNs. Check NaNs in - // inputs and return them unmodified. - Packet2ul isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); - return vec_sel(_x, pmax(pmul(x, reinterpret_cast(emm0)), _x), - isnumber_mask); +// Hyperbolic Tangent function. +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& x) { + return internal::generic_fast_tanh_float(x); } -#endif } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProduct.h new file mode 100644 index 00000000..3f79b97d --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -0,0 +1,2937 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com) +// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H +#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H + +#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK +#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1 +#endif + +#include "MatrixProductCommon.h" + +// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX +#if EIGEN_COMP_LLVM +#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY) +#ifdef __MMA__ +#define EIGEN_ALTIVEC_MMA_ONLY +#else +#define EIGEN_ALTIVEC_DISABLE_MMA +#endif +#endif +#endif + +#ifdef __has_builtin +#if __has_builtin(__builtin_mma_assemble_acc) + #define ALTIVEC_MMA_SUPPORT +#endif +#endif + +#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #include "MatrixProductMMA.h" +#endif + +/************************************************************************************************** + * TODO * + * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could). * + * - Check the possibility of transposing as GETREAL and GETIMAG when needed. 
* + **************************************************************************************************/ +namespace Eigen { + +namespace internal { + +/************************** + * Constants and typedefs * + **************************/ +template +struct quad_traits +{ + typedef typename packet_traits::type vectortype; + typedef PacketBlock type; + typedef vectortype rhstype; + enum + { + vectorsize = packet_traits::size, + size = 4, + rows = 4 + }; +}; + +template<> +struct quad_traits +{ + typedef Packet2d vectortype; + typedef PacketBlock type; + typedef PacketBlock rhstype; + enum + { + vectorsize = packet_traits::size, + size = 2, + rows = 4 + }; +}; + +// MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector, this turned out +// to be faster than Eigen's usual approach of having real/imaginary pairs on a single vector. This constants then +// are responsible to extract from convert between Eigen's and MatrixProduct approach. + +const static Packet16uc p16uc_GETREAL32 = { 0, 1, 2, 3, + 8, 9, 10, 11, + 16, 17, 18, 19, + 24, 25, 26, 27}; + +const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7, + 12, 13, 14, 15, + 20, 21, 22, 23, + 28, 29, 30, 31}; +const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7, + 16, 17, 18, 19, 20, 21, 22, 23}; + +//[a,ai],[b,bi] = [ai,bi] +const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, + 24, 25, 26, 27, 28, 29, 30, 31}; + +/********************************************* + * Single precision real and complex packing * + * *******************************************/ + +/** + * Symm packing is related to packing of symmetric adjoint blocks, as expected the packing leaves + * the diagonal real, whatever is below it is copied from the respective upper diagonal element and + * conjugated. There's no PanelMode available for symm packing. 
+ * + * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using + * its respective rank-update instructions. The float32/64 versions are different because at this moment + * the size of the accumulator is fixed at 512-bits so you can't have a 4x4 accumulator of 64-bit elements. + * + * As mentioned earlier MatrixProduct breaks complex numbers into a real vector and a complex vector so packing has + * to take that into account, at the moment, we run pack the real part and then the imaginary part, this is the main + * reason why packing for complex is broken down into several different parts, also the reason why we endup having a + * float32/64 and complex float32/64 version. + **/ +template +EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) +{ + std::complex v; + if(i < j) + { + v.real( dt(j,i).real()); + v.imag(-dt(j,i).imag()); + } else if(i > j) + { + v.real( dt(i,j).real()); + v.imag( dt(i,j).imag()); + } else { + v.real( dt(i,j).real()); + v.imag((Scalar)0.0); + } + return v; +} + +template +EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) +{ + const Index depth = k2 + rows; + const_blas_data_mapper, Index, StorageOrder> rhs(_rhs, rhsStride); + const Index vectorSize = N*quad_traits::vectorsize; + const Index vectorDelta = vectorSize * rows; + Scalar* blockBf = reinterpret_cast(blockB); + + Index rir = 0, rii, j = 0; + for(; j + vectorSize <= cols; j+=vectorSize) + { + rii = rir + vectorDelta; + + for(Index i = k2; i < depth; i++) + { + for(Index k = 0; k < vectorSize; k++) + { + std::complex v = getAdjointVal(i, j + k, rhs); + + blockBf[rir + k] = v.real(); + blockBf[rii + k] = v.imag(); + } + rir += vectorSize; + rii += vectorSize; + } + + rir += vectorDelta; + } + if (j < cols) + { + rii = rir + ((cols - j) * rows); + + for(Index i = k2; i < 
depth; i++) + { + Index k = j; + for(; k < cols; k++) + { + std::complex v = getAdjointVal(i, k, rhs); + + blockBf[rir] = v.real(); + blockBf[rii] = v.imag(); + + rir += 1; + rii += 1; + } + } + } +} + +template +EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) +{ + const Index depth = cols; + const_blas_data_mapper, Index, StorageOrder> lhs(_lhs, lhsStride); + const Index vectorSize = quad_traits::vectorsize; + const Index vectorDelta = vectorSize * depth; + Scalar* blockAf = (Scalar *)(blockA); + + Index rir = 0, rii, j = 0; + for(; j + vectorSize <= rows; j+=vectorSize) + { + rii = rir + vectorDelta; + + for(Index i = 0; i < depth; i++) + { + for(Index k = 0; k < vectorSize; k++) + { + std::complex v = getAdjointVal(j+k, i, lhs); + + blockAf[rir + k] = v.real(); + blockAf[rii + k] = v.imag(); + } + rir += vectorSize; + rii += vectorSize; + } + + rir += vectorDelta; + } + + if (j < rows) + { + rii = rir + ((rows - j) * depth); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + std::complex v = getAdjointVal(k, i, lhs); + + blockAf[rir] = v.real(); + blockAf[rii] = v.imag(); + + rir += 1; + rii += 1; + } + } + } +} + +template +EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) +{ + const Index depth = k2 + rows; + const_blas_data_mapper rhs(_rhs, rhsStride); + const Index vectorSize = quad_traits::vectorsize; + + Index ri = 0, j = 0; + for(; j + N*vectorSize <= cols; j+=N*vectorSize) + { + Index i = k2; + for(; i < depth; i++) + { + for(Index k = 0; k < N*vectorSize; k++) + { + if(i <= j+k) + blockB[ri + k] = rhs(j+k, i); + else + blockB[ri + k] = rhs(i, j+k); + } + ri += N*vectorSize; + } + } + + if (j < cols) + { + for(Index i = k2; i < depth; i++) + { + Index k = j; + for(; k < cols; k++) + { + if(k <= i) + blockB[ri] = rhs(i, k); + else + blockB[ri] = 
rhs(k, i); + ri += 1; + } + } + } +} + +template +EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) +{ + const Index depth = cols; + const_blas_data_mapper lhs(_lhs, lhsStride); + const Index vectorSize = quad_traits::vectorsize; + + Index ri = 0, j = 0; + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + for(; i < depth; i++) + { + for(Index k = 0; k < vectorSize; k++) + { + if(i <= j+k) + blockA[ri + k] = lhs(j+k, i); + else + blockA[ri + k] = lhs(i, j+k); + } + ri += vectorSize; + } + } + + if (j < rows) + { + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + if(i <= k) + blockA[ri] = lhs(k, i); + else + blockA[ri] = lhs(i, k); + ri += 1; + } + } + } +} + +template +struct symm_pack_rhs, Index, nr, StorageOrder> +{ + void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + } +}; + +template +struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrder> +{ + void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) + { + symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + } +}; + +// *********** symm_pack std::complex *********** + +template +struct symm_pack_rhs, Index, nr, StorageOrder> +{ + void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + } +}; + +template +struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrder> +{ + void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) + { + symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + } +}; + +// *********** symm_pack float32 *********** +template +struct symm_pack_rhs +{ + void 
operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + } +}; + +template +struct symm_pack_lhs +{ + void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows) + { + symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + } +}; + +// *********** symm_pack float64 *********** +template +struct symm_pack_rhs +{ + void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + } +}; + +template +struct symm_pack_lhs +{ + void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows) + { + symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + } +}; + +/** + * PanelMode + * Packing might be called several times before being multiplied by gebp_kernel, this happens because + * on special occasions it fills part of block with other parts of the matrix. Two variables control + * how PanelMode should behave: offset and stride. The idea is that those variables represent whatever + * is going to be the real offset and stride in the future and this is what you should obey. The process + * is to behave as you would with normal packing but leave the start of each part with the correct offset + * and the end as well respecting the real stride the block will have. Gebp is aware of both blocks stride + * and offset and behaves accordingly. 
+ **/ + +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) +{ + const Index size = 16 / sizeof(Scalar); + pstore(to + (0 * size), block.packet[0]); + pstore(to + (1 * size), block.packet[1]); + pstore(to + (2 * size), block.packet[2]); + pstore(to + (3 * size), block.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) +{ + const Index size = 16 / sizeof(Scalar); + pstore(to + (0 * size), block.packet[0]); + pstore(to + (1 * size), block.packet[1]); +} + +// General template for lhs & rhs complex packing. +template +struct dhs_cpack { + EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth); + Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii; + Scalar* blockAt = reinterpret_cast(blockA); + Index j = 0; + + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + rii = rir + vectorDelta; + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock blockr, blocki; + PacketBlock cblock; + + if (UseLhs) { + bload(cblock, lhs, j, i); + } else { + bload(cblock, lhs, i, j); + } + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); + blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32); + blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32); + blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32); + + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32); + blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32); + blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32); + blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32); + 
+ if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + blocki.packet[1] = -blocki.packet[1]; + blocki.packet[2] = -blocki.packet[2]; + blocki.packet[3] = -blocki.packet[3]; + } + + if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs))) + { + ptranspose(blockr); + ptranspose(blocki); + } + + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); + + rir += 4*vectorSize; + rii += 4*vectorSize; + } + for(; i < depth; i++) + { + PacketBlock blockr, blocki; + PacketBlock cblock; + + if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs))) + { + if (UseLhs) { + cblock.packet[0] = lhs.template loadPacket(j + 0, i); + cblock.packet[1] = lhs.template loadPacket(j + 2, i); + } else { + cblock.packet[0] = lhs.template loadPacket(i, j + 0); + cblock.packet[1] = lhs.template loadPacket(i, j + 2); + } + } else { + std::complex lhs0, lhs1; + if (UseLhs) { + lhs0 = lhs(j + 0, i); + lhs1 = lhs(j + 1, i); + cblock.packet[0] = pload2(&lhs0, &lhs1); + lhs0 = lhs(j + 2, i); + lhs1 = lhs(j + 3, i); + cblock.packet[1] = pload2(&lhs0, &lhs1); + } else { + lhs0 = lhs(i, j + 0); + lhs1 = lhs(i, j + 1); + cblock.packet[0] = pload2(&lhs0, &lhs1); + lhs0 = lhs(i, j + 2); + lhs1 = lhs(i, j + 3); + cblock.packet[1] = pload2(&lhs0, &lhs1); + } + } + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32); + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32); + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + } + + pstore(blockAt + rir, blockr.packet[0]); + pstore(blockAt + rii, blocki.packet[0]); + + rir += vectorSize; + rii += vectorSize; + } + + rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta); + } + + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? 
stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + if (UseLhs) { + blockAt[rir] = lhs(k, i).real(); + + if(Conjugate) + blockAt[rii] = -lhs(k, i).imag(); + else + blockAt[rii] = lhs(k, i).imag(); + } else { + blockAt[rir] = lhs(i, k).real(); + + if(Conjugate) + blockAt[rii] = -lhs(i, k).imag(); + else + blockAt[rii] = lhs(i, k).imag(); + } + + rir += 1; + rii += 1; + } + } + } + } +}; + +// General template for lhs & rhs packing. +template +struct dhs_pack{ + EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + Index ri = 0, j = 0; + + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + if(PanelMode) ri += vectorSize*offset; + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock block; + + if (UseLhs) { + bload(block, lhs, j, i); + } else { + bload(block, lhs, i, j); + } + if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) + { + ptranspose(block); + } + + storeBlock(blockA + ri, block); + + ri += 4*vectorSize; + } + for(; i < depth; i++) + { + if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) + { + if (UseLhs) { + blockA[ri+0] = lhs(j+0, i); + blockA[ri+1] = lhs(j+1, i); + blockA[ri+2] = lhs(j+2, i); + blockA[ri+3] = lhs(j+3, i); + } else { + blockA[ri+0] = lhs(i, j+0); + blockA[ri+1] = lhs(i, j+1); + blockA[ri+2] = lhs(i, j+2); + blockA[ri+3] = lhs(i, j+3); + } + } else { + Packet lhsV; + if (UseLhs) { + lhsV = lhs.template loadPacket(j, i); + } else { + lhsV = lhs.template loadPacket(i, j); + } + pstore(blockA + ri, lhsV); + } + + ri += vectorSize; + } + + if(PanelMode) ri += vectorSize*(stride - offset - depth); + } + + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + if 
(UseLhs) { + blockA[ri] = lhs(k, i); + } else { + blockA[ri] = lhs(i, k); + } + ri += 1; + } + } + } + } +}; + +// General template for lhs packing, float64 specialization. +template +struct dhs_pack +{ + EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + Index ri = 0, j = 0; + + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + if(PanelMode) ri += vectorSize*offset; + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock block; + if(StorageOrder == RowMajor) + { + block.packet[0] = lhs.template loadPacket(j + 0, i); + block.packet[1] = lhs.template loadPacket(j + 1, i); + + ptranspose(block); + } else { + block.packet[0] = lhs.template loadPacket(j, i + 0); + block.packet[1] = lhs.template loadPacket(j, i + 1); + } + + storeBlock(blockA + ri, block); + + ri += 2*vectorSize; + } + for(; i < depth; i++) + { + if(StorageOrder == RowMajor) + { + blockA[ri+0] = lhs(j+0, i); + blockA[ri+1] = lhs(j+1, i); + } else { + Packet2d lhsV = lhs.template loadPacket(j, i); + pstore(blockA + ri, lhsV); + } + + ri += vectorSize; + } + + if(PanelMode) ri += vectorSize*(stride - offset - depth); + } + + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + blockA[ri] = lhs(k, i); + ri += 1; + } + } + } + } +}; + +// General template for rhs packing, float64 specialization. 
+template +struct dhs_pack +{ + EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + Index ri = 0, j = 0; + + for(; j + 2*vectorSize <= cols; j+=2*vectorSize) + { + Index i = 0; + + if(PanelMode) ri += offset*(2*vectorSize); + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock block; + if(StorageOrder == ColMajor) + { + PacketBlock block1, block2; + block1.packet[0] = rhs.template loadPacket(i, j + 0); + block1.packet[1] = rhs.template loadPacket(i, j + 1); + block2.packet[0] = rhs.template loadPacket(i, j + 2); + block2.packet[1] = rhs.template loadPacket(i, j + 3); + + ptranspose(block1); + ptranspose(block2); + + pstore(blockB + ri , block1.packet[0]); + pstore(blockB + ri + 2, block2.packet[0]); + pstore(blockB + ri + 4, block1.packet[1]); + pstore(blockB + ri + 6, block2.packet[1]); + } else { + block.packet[0] = rhs.template loadPacket(i + 0, j + 0); //[a1 a2] + block.packet[1] = rhs.template loadPacket(i + 0, j + 2); //[a3 a4] + block.packet[2] = rhs.template loadPacket(i + 1, j + 0); //[b1 b2] + block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] + + storeBlock(blockB + ri, block); + } + + ri += 4*vectorSize; + } + for(; i < depth; i++) + { + if(StorageOrder == ColMajor) + { + blockB[ri+0] = rhs(i, j+0); + blockB[ri+1] = rhs(i, j+1); + + ri += vectorSize; + + blockB[ri+0] = rhs(i, j+2); + blockB[ri+1] = rhs(i, j+3); + } else { + Packet2d rhsV = rhs.template loadPacket(i, j); + pstore(blockB + ri, rhsV); + + ri += vectorSize; + + rhsV = rhs.template loadPacket(i, j + 2); + pstore(blockB + ri, rhsV); + } + ri += vectorSize; + } + + if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth); + } + + if (j < cols) + { + if(PanelMode) ri += offset*(cols - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < cols; k++) + { + blockB[ri] = rhs(i, k); + ri += 1; + } + } + } + } +}; + 
+// General template for lhs complex packing, float64 specialization. +template +struct dhs_cpack +{ + EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth); + Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii; + double* blockAt = reinterpret_cast(blockA); + Index j = 0; + + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + rii = rir + vectorDelta; + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock blockr, blocki; + PacketBlock cblock; + + if(StorageOrder == ColMajor) + { + cblock.packet[0] = lhs.template loadPacket(j, i + 0); //[a1 a1i] + cblock.packet[1] = lhs.template loadPacket(j, i + 1); //[b1 b1i] + + cblock.packet[2] = lhs.template loadPacket(j + 1, i + 0); //[a2 a2i] + cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); //[b2 b2i] + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2] + blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2] + + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64); + blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64); + } else { + cblock.packet[0] = lhs.template loadPacket(j + 0, i); //[a1 a1i] + cblock.packet[1] = lhs.template loadPacket(j + 1, i); //[a2 a2i] + + cblock.packet[2] = lhs.template loadPacket(j + 0, i + 1); //[b1 b1i] + cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); //[b2 b2i + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2] + blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2] + + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); + blocki.packet[1] = 
vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64); + } + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + blocki.packet[1] = -blocki.packet[1]; + } + + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); + + rir += 2*vectorSize; + rii += 2*vectorSize; + } + for(; i < depth; i++) + { + PacketBlock blockr, blocki; + PacketBlock cblock; + + cblock.packet[0] = lhs.template loadPacket(j + 0, i); + cblock.packet[1] = lhs.template loadPacket(j + 1, i); + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + } + + pstore(blockAt + rir, blockr.packet[0]); + pstore(blockAt + rii, blocki.packet[0]); + + rir += vectorSize; + rii += vectorSize; + } + + rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta); + } + + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + blockAt[rir] = lhs(k, i).real(); + + if(Conjugate) + blockAt[rii] = -lhs(k, i).imag(); + else + blockAt[rii] = lhs(k, i).imag(); + + rir += 1; + rii += 1; + } + } + } + } +}; + +// General template for rhs complex packing, float64 specialization. +template +struct dhs_cpack +{ + EIGEN_STRONG_INLINE void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth); + Index rir = ((PanelMode) ? 
(2*vectorSize*offset) : 0), rii; + double* blockBt = reinterpret_cast(blockB); + Index j = 0; + + for(; j + 2*vectorSize <= cols; j+=2*vectorSize) + { + Index i = 0; + + rii = rir + vectorDelta; + + for(; i < depth; i++) + { + PacketBlock cblock; + PacketBlock blockr, blocki; + + bload(cblock, rhs, i, j); + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); + blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); + + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); + blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64); + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + blocki.packet[1] = -blocki.packet[1]; + } + + storeBlock(blockBt + rir, blockr); + storeBlock(blockBt + rii, blocki); + + rir += 2*vectorSize; + rii += 2*vectorSize; + } + + rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta); + } + + if (j < cols) + { + if(PanelMode) rir += (offset*(cols - j - 2*vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (cols - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < cols; k++) + { + blockBt[rir] = rhs(i, k).real(); + + if(Conjugate) + blockBt[rii] = -rhs(i, k).imag(); + else + blockBt[rii] = rhs(i, k).imag(); + + rir += 1; + rii += 1; + } + } + } + } +}; + +/************** + * GEMM utils * + **************/ + +// 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm). 
+template +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +{ + if(NegativeAccumulate) + { + acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); + acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); + acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); + acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); + } else { + acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); + acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); + acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); + acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); + } +} + +template +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +{ + if(NegativeAccumulate) + { + acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); + } else { + acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); + } +} + +template +EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV) +{ + Packet lhsV = pload(lhs); + + pger_common(acc, lhsV, rhsV); +} + +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows) +{ +#ifdef _ARCH_PWR9 + lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar)); +#else + Index i = 0; + do { + lhsV[i] = lhs[i]; + } while (++i < remaining_rows); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows) +{ + Packet lhsV; + loadPacketRemaining(lhs, lhsV, remaining_rows); + + pger_common(acc, lhsV, rhsV); +} + +// 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real. 
+template +EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) +{ + pger_common(accReal, lhsV, rhsV); + if(LhsIsReal) + { + pger_common(accImag, lhsV, rhsVi); + EIGEN_UNUSED_VARIABLE(lhsVi); + } else { + if (!RhsIsReal) { + pger_common(accReal, lhsVi, rhsVi); + pger_common(accImag, lhsV, rhsVi); + } else { + EIGEN_UNUSED_VARIABLE(rhsVi); + } + pger_common(accImag, lhsVi, rhsV); + } +} + +template +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) +{ + Packet lhsV = ploadLhs(lhs_ptr); + Packet lhsVi; + if(!LhsIsReal) lhsVi = ploadLhs(lhs_ptr_imag); + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + + pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); +} + +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) +{ +#ifdef _ARCH_PWR9 + lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); + if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows * sizeof(Scalar)); + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); +#else + Index i = 0; + do { + lhsV[i] = lhs_ptr[i]; + if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i]; + } while (++i < remaining_rows); + if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) +{ + Packet lhsV, lhsVi; + loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); + + pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); +} + +template +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) +{ + return ploadu(lhs); +} + +// Zero the accumulator on PacketBlock. 
+template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +{ + acc.packet[0] = pset1((Scalar)0); + acc.packet[1] = pset1((Scalar)0); + acc.packet[2] = pset1((Scalar)0); + acc.packet[3] = pset1((Scalar)0); +} + +template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +{ + acc.packet[0] = pset1((Scalar)0); +} + +// Scale the PacketBlock vectors by alpha. +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); + acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); + acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); + acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); +} + +template +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + acc.packet[0] = pmul(accZ.packet[0], pAlpha); + acc.packet[1] = pmul(accZ.packet[1], pAlpha); + acc.packet[2] = pmul(accZ.packet[2], pAlpha); + acc.packet[3] = pmul(accZ.packet[3], pAlpha); +} + +template +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + acc.packet[0] = pmul(accZ.packet[0], pAlpha); +} + +// Complex version of PacketBlock scaling. 
+template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag) +{ + bscalec_common(cReal, aReal, bReal); + + bscalec_common(cImag, aImag, bReal); + + pger_common(&cReal, bImag, aImag.packet); + + pger_common(&cImag, bImag, aReal.packet); +} + +template +EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) +{ + acc.packet[0] = pand(acc.packet[0], pMask); + acc.packet[1] = pand(acc.packet[1], pMask); + acc.packet[2] = pand(acc.packet[2], pMask); + acc.packet[3] = pand(acc.packet[3], pMask); +} + +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) +{ + band(aReal, pMask); + band(aImag, pMask); + + bscalec(aReal, aImag, bReal, bImag, cReal, cImag); +} + +// Load a PacketBlock, the N parameters make tunning gemm easier so we can add more accumulators as needed. +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +{ + if (StorageOrder == RowMajor) { + acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); + acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); + acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); + acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); + } else { + acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); + acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); + acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); + acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); + } +} + +// An overload of bload when you have a PacketBLock with 8 vectors. 
+template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +{ + if (StorageOrder == RowMajor) { + acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); + acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); + acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); + acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); + acc.packet[4] = res.template loadPacket(row + 0, col + (N+1)*accCols); + acc.packet[5] = res.template loadPacket(row + 1, col + (N+1)*accCols); + acc.packet[6] = res.template loadPacket(row + 2, col + (N+1)*accCols); + acc.packet[7] = res.template loadPacket(row + 3, col + (N+1)*accCols); + } else { + acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); + acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); + acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); + acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); + acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); + acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); + acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); + acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); + } +} + +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +{ + acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); + acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); +} + +const static Packet4i mask41 = { -1, 0, 0, 0 }; +const static Packet4i mask42 = { -1, -1, 0, 0 }; +const static Packet4i mask43 = { -1, -1, -1, 0 }; + +const static Packet2l mask21 = { -1, 0 }; + +template +EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows) +{ + if (remaining_rows == 0) { + return pset1(float(0.0)); // Not used + } else { + switch (remaining_rows) { + case 1: return Packet(mask41); + case 2: return Packet(mask42); + 
default: return Packet(mask43); + } + } +} + +template<> +EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) +{ + if (remaining_rows == 0) { + return pset1(double(0.0)); // Not used + } else { + return Packet2d(mask21); + } +} + +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +{ + band(accZ, pMask); + + bscale(acc, accZ, pAlpha); +} + +template +EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + pbroadcast4(a, a0, a1, a2, a3); +} + +template<> +EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload(a); + a3 = pload(a + 2); + a0 = vec_splat(a1, 0); + a1 = vec_splat(a1, 1); + a2 = vec_splat(a3, 0); + a3 = vec_splat(a3, 1); +} + +// PEEL loop factor. +#define PEEL 7 + +template +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL( + const Scalar* &lhs_ptr, + const Scalar* &rhs_ptr, + PacketBlock &accZero, + Index remaining_rows, + Index remaining_cols) +{ + Packet rhsV[1]; + rhsV[0] = pset1(rhs_ptr[0]); + pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); + lhs_ptr += remaining_rows; + rhs_ptr += remaining_cols; +} + +template +EIGEN_STRONG_INLINE void gemm_extra_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index remaining_rows, + Index remaining_cols, + const Packet& pAlpha) +{ + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; + PacketBlock accZero; + + bsetzero(accZero); + + Index remaining_depth = (depth & -accRows); + Index k = 0; + for(; k + PEEL <= remaining_depth; k+= PEEL) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + EIGEN_POWER_PREFETCH(lhs_ptr); + for (int l = 0; l < PEEL; l++) { + MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); + } + } + 
for(; k < remaining_depth; k++) + { + MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); + } + for(; k < depth; k++) + { + Packet rhsV[1]; + rhsV[0] = pset1(rhs_ptr[0]); + pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); + lhs_ptr += remaining_rows; + rhs_ptr += remaining_cols; + } + + accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]); + for(Index i = 0; i < remaining_rows; i++) { + res(row + i, col) += accZero.packet[0][i]; + } +} + +template +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( + const Scalar* &lhs_ptr, + const Scalar* &rhs_ptr, + PacketBlock &accZero, + Index remaining_rows) +{ + Packet rhsV[4]; + pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); + lhs_ptr += remaining_rows; + rhs_ptr += accRows; +} + +template +EIGEN_STRONG_INLINE void gemm_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; + PacketBlock accZero, acc; + + bsetzero(accZero); + + Index remaining_depth = (col + accRows < cols) ? 
depth : (depth & -accRows); + Index k = 0; + for(; k + PEEL <= remaining_depth; k+= PEEL) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + EIGEN_POWER_PREFETCH(lhs_ptr); + for (int l = 0; l < PEEL; l++) { + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); + } + } + for(; k < remaining_depth; k++) + { + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); + } + + if ((remaining_depth == depth) && (rows >= accCols)) + { + for(Index j = 0; j < 4; j++) { + acc.packet[j] = res.template loadPacket(row, col + j); + } + bscale(acc, accZero, pAlpha, pMask); + res.template storePacketBlock(row, col, acc); + } else { + for(; k < depth; k++) + { + Packet rhsV[4]; + pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); + lhs_ptr += remaining_rows; + rhs_ptr += accRows; + } + + for(Index j = 0; j < 4; j++) { + accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]); + } + for(Index j = 0; j < 4; j++) { + for(Index i = 0; i < remaining_rows; i++) { + res(row + i, col + j) += accZero.packet[j][i]; + } + } + } +} + +#define MICRO_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_UNROLL_WORK(func, func2, peel) \ + MICRO_UNROLL(func2); \ + func(0,peel) func(1,peel) func(2,peel) func(3,peel) \ + func(4,peel) func(5,peel) func(6,peel) func(7,peel) + +#define MICRO_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr##iter); \ + lhs_ptr##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + } + +#define MICRO_WORK_ONE(iter, peel) \ + if (unroll_factor > iter) { \ + pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ + } + +#define MICRO_TYPE_PEEL4(func, func2, peel) \ + if (PEEL > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ + pbroadcast4(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + MICRO_UNROLL_WORK(func, func2, peel) \ 
+ } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + +#define MICRO_TYPE_PEEL1(func, func2, peel) \ + if (PEEL > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ + rhsV##peel[0] = pset1(rhs_ptr[remaining_cols * peel]); \ + MICRO_UNROLL_WORK(func, func2, peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + +#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ + func(func1,func2,0); func(func1,func2,1); \ + func(func1,func2,2); func(func1,func2,3); \ + func(func1,func2,4); func(func1,func2,5); \ + func(func1,func2,6); func(func1,func2,7); \ + func(func1,func2,8); func(func1,func2,9); + +#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \ + Packet rhsV0[M]; \ + func(func1,func2,0); + +#define MICRO_ONE_PEEL4 \ + MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += (accRows * PEEL); + +#define MICRO_ONE4 \ + MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += accRows; + +#define MICRO_ONE_PEEL1 \ + MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += (remaining_cols * PEEL); + +#define MICRO_ONE1 \ + MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += remaining_cols; + +#define MICRO_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + bsetzero(accZero##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##iter); \ + } + +#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE) + +#define MICRO_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ + } + +#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE) + +#define MICRO_PREFETCH_ONE(iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr##iter); \ 
+ } + +#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE) + +#define MICRO_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ + acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \ + acc.packet[2] = res.template loadPacket(row + iter*accCols, col + 2); \ + acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \ + bscale(acc, accZero##iter, pAlpha); \ + res.template storePacketBlock(row + iter*accCols, col, acc); \ + } + +#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) + +#define MICRO_COL_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ + bscale(acc, accZero##iter, pAlpha); \ + res.template storePacketBlock(row + iter*accCols, col, acc); \ + } + +#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE) + +template +EIGEN_STRONG_INLINE void gemm_unrolled_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index col, + const Packet& pAlpha) +{ + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + PacketBlock acc; + + MICRO_SRC_PTR + MICRO_DST_PTR + + Index k = 0; + for(; k + PEEL <= depth; k+= PEEL) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + MICRO_PREFETCH + MICRO_ONE_PEEL4 + } + for(; k < depth; k++) + { + MICRO_ONE4 + } + MICRO_STORE + + row += unroll_factor*accCols; +} + +template +EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index col, + Index remaining_cols, + const Packet& pAlpha) +{ + const 
Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + PacketBlock acc; + + MICRO_SRC_PTR + MICRO_DST_PTR + + Index k = 0; + for(; k + PEEL <= depth; k+= PEEL) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + MICRO_PREFETCH + MICRO_ONE_PEEL1 + } + for(; k < depth; k++) + { + MICRO_ONE1 + } + MICRO_COL_STORE + + row += unroll_factor*accCols; +} + +template +EIGEN_STRONG_INLINE void gemm_unrolled_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index rows, + Index col, + Index remaining_cols, + const Packet& pAlpha) +{ +#define MAX_UNROLL 6 + while(row + MAX_UNROLL*accCols <= rows) { + gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + } + switch( (rows-row)/accCols ) { +#if MAX_UNROLL > 7 + case 7: + gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 6 + case 6: + gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 5 + case 5: + gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 4 + case 4: + gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 3 + case 3: + gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, 
accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 2 + case 2: + gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 1 + case 1: + gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif + default: + break; + } +#undef MAX_UNROLL +} + +/**************** + * GEMM kernels * + * **************/ +template +EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; + const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + + const Packet pAlpha = pset1(alpha); + const Packet pMask = bmask((const int)(remaining_rows)); + + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA; + Index row = 0; + +#define MAX_UNROLL 6 + while(row + MAX_UNROLL*accCols <= rows) { + gemm_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + } + switch( (rows-row)/accCols ) { +#if MAX_UNROLL > 7 + case 7: + gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 6 + case 6: + gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 5 + case 5: + gemm_unrolled_iteration<5, Scalar, Packet, 
DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 4 + case 4: + gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 3 + case 3: + gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 2 + case 2: + gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 1 + case 1: + gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif + default: + break; + } +#undef MAX_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } + } + + if(remaining_cols > 0) + { + const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; + const Scalar* lhs_base = blockA; + + for(; col < cols; col++) + { + Index row = 0; + + gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); + + if (remaining_rows > 0) + { + gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); + } + rhs_base++; + } + } +} + +#define accColsC (accCols / 2) +#define advanceRows ((LhsIsReal) ? 1 : 2) +#define advanceCols ((RhsIsReal) ? 1 : 2) + +// PEEL_COMPLEX loop factor. 
+#define PEEL_COMPLEX 3 + +template +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL( + const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, + const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, + PacketBlock &accReal, PacketBlock &accImag, + Index remaining_rows, + Index remaining_cols) +{ + Packet rhsV[1], rhsVi[1]; + rhsV[0] = pset1(rhs_ptr_real[0]); + if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); + pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + rhs_ptr_real += remaining_cols; + if(!RhsIsReal) rhs_ptr_imag += remaining_cols; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); +} + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index remaining_rows, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; + const Scalar* lhs_ptr_imag; + if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + PacketBlock accReal, accImag; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + + bsetzero(accReal); + bsetzero(accImag); + + Index remaining_depth = (depth & -accRows); + Index k = 0; + for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + 
EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + for (int l = 0; l < PEEL_COMPLEX; l++) { + MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); + } + } + for(; k < remaining_depth; k++) + { + MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); + } + + for(; k < depth; k++) + { + Packet rhsV[1], rhsVi[1]; + rhsV[0] = pset1(rhs_ptr_real[0]); + if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); + pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + rhs_ptr_real += remaining_cols; + if(!RhsIsReal) rhs_ptr_imag += remaining_cols; + } + + bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); + bcouple_common(taccReal, taccImag, acc0, acc1); + + if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) + { + res(row + 0, col + 0) += pfirst(acc0.packet[0]); + } else { + acc0.packet[0] += res.template loadPacket(row + 0, col + 0); + res.template storePacketBlock(row + 0, col + 0, acc0); + if(remaining_rows > accColsC) { + res(row + accColsC, col + 0) += pfirst(acc1.packet[0]); + } + } +} + +template +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( + const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, + const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, + PacketBlock &accReal, PacketBlock &accImag, + Index remaining_rows) +{ + Packet rhsV[4], rhsVi[4]; + pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) 
lhs_ptr_imag += remaining_rows; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + rhs_ptr_real += accRows; + if(!RhsIsReal) rhs_ptr_imag += accRows; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); +} + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; + const Scalar* lhs_ptr_imag; + if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + PacketBlock accReal, accImag; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; + + bsetzero(accReal); + bsetzero(accImag); + + Index remaining_depth = (col + accRows < cols) ? 
depth : (depth & -accRows); + Index k = 0; + for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + for (int l = 0; l < PEEL_COMPLEX; l++) { + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); + } + } + for(; k < remaining_depth; k++) + { + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); + } + + if ((remaining_depth == depth) && (rows >= accCols)) + { + bload(tRes, res, row, col); + bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); + bcouple(taccReal, taccImag, tRes, acc0, acc1); + res.template storePacketBlock(row + 0, col, acc0); + res.template storePacketBlock(row + accColsC, col, acc1); + } else { + for(; k < depth; k++) + { + Packet rhsV[4], rhsVi[4]; + pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + rhs_ptr_real += accRows; + if(!RhsIsReal) rhs_ptr_imag += accRows; + } + + bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); + bcouple_common(taccReal, taccImag, acc0, acc1); + + if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) + { + for(Index j = 0; j < 4; j++) { + res(row + 0, col + j) += pfirst(acc0.packet[j]); + } + } else { + for(Index j = 0; j < 4; j++) { + PacketBlock acc2; + acc2.packet[0] = res.template loadPacket(row + 0, col + j) + acc0.packet[j]; + res.template storePacketBlock(row + 0, col + j, acc2); + 
if(remaining_rows > accColsC) { + res(row + accColsC, col + j) += pfirst(acc1.packet[j]); + } + } + } + } +} + +#define MICRO_COMPLEX_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) + +#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ + MICRO_COMPLEX_UNROLL(func2); \ + func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) + +#define MICRO_COMPLEX_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ + lhs_ptr_real##iter += accCols; \ + if(!LhsIsReal) { \ + lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ + lhs_ptr_imag##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } + +#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \ + if (unroll_factor > iter) { \ + pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + } + +#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ + if (unroll_factor > iter) { \ + pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + } + +#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ + if (PEEL_COMPLEX > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + pbroadcast4_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + if(!RhsIsReal) { \ + pbroadcast4_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } \ + MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ + if (PEEL_COMPLEX > peel) { \ + Packet 
lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + rhsV##peel[0] = pset1(rhs_ptr_real[remaining_cols * peel]); \ + if(!RhsIsReal) { \ + rhsVi##peel[0] = pset1(rhs_ptr_imag[remaining_cols * peel]); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } \ + MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ + Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ + func(func1,func2,0); func(func1,func2,1); \ + func(func1,func2,2); func(func1,func2,3); \ + func(func1,func2,4); func(func1,func2,5); \ + func(func1,func2,6); func(func1,func2,7); \ + func(func1,func2,8); func(func1,func2,9); + +#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ + Packet rhsV0[M], rhsVi0[M];\ + func(func1,func2,0); + +#define MICRO_COMPLEX_ONE_PEEL4 \ + MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \ + rhs_ptr_real += (accRows * PEEL_COMPLEX); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX); + +#define MICRO_COMPLEX_ONE4 \ + MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \ + rhs_ptr_real += accRows; \ + if(!RhsIsReal) rhs_ptr_imag += accRows; + +#define MICRO_COMPLEX_ONE_PEEL1 \ + MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ + rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ + if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); + +#define MICRO_COMPLEX_ONE1 \ + MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ + rhs_ptr_real += 
remaining_cols; \ + if(!RhsIsReal) rhs_ptr_imag += remaining_cols; + +#define MICRO_COMPLEX_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + bsetzero(accReal##iter); \ + bsetzero(accImag##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##iter); \ + EIGEN_UNUSED_VARIABLE(accImag##iter); \ + } + +#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE) + +#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ + if(!LhsIsReal) { \ + lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } + +#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) + +#define MICRO_COMPLEX_PREFETCH_ONE(iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ + if(!LhsIsReal) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ + } \ + } + +#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) + +#define MICRO_COMPLEX_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + bload(tRes, res, row + iter*accCols, col); \ + bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ + res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + } + +#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) + +#define MICRO_COMPLEX_COL_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + bload(tRes, res, row + iter*accCols, col); \ + bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + res.template storePacketBlock(row + 
iter*accCols + 0, col, acc0); \ + res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + } + +#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index col, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) { + rhs_ptr_imag = rhs_base + accRows*strideB; + } else { + EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock accReal4, accImag4; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; + + MICRO_COMPLEX_SRC_PTR + MICRO_COMPLEX_DST_PTR + + Index k = 0; + for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + MICRO_COMPLEX_PREFETCH + MICRO_COMPLEX_ONE_PEEL4 + } + for(; k < depth; k++) + { + MICRO_COMPLEX_ONE4 + } + MICRO_COMPLEX_STORE + + row += unroll_factor*accCols; +} + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index col, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) { 
+ rhs_ptr_imag = rhs_base + remaining_cols*strideB; + } else { + EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock accReal4, accImag4; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; + + MICRO_COMPLEX_SRC_PTR + MICRO_COMPLEX_DST_PTR + + Index k = 0; + for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + MICRO_COMPLEX_PREFETCH + MICRO_COMPLEX_ONE_PEEL1 + } + for(; k < depth; k++) + { + MICRO_COMPLEX_ONE1 + } + MICRO_COMPLEX_COL_STORE + + row += unroll_factor*accCols; +} + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index rows, + Index col, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ +#define MAX_COMPLEX_UNROLL 3 + while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { + gemm_complex_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_UNROLL > 4 + case 4: + gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 3 + case 3: + gemm_complex_unrolled_col_iteration<3, Scalar, 
Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 2 + case 2: + gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 1 + case 1: + gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_UNROLL +} + +template +EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; + const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + + const Packet pAlphaReal = pset1(alpha.real()); + const Packet pAlphaImag = pset1(alpha.imag()); + const Packet pMask = bmask((const int)(remaining_rows)); + + const Scalar* blockA = (Scalar *) blockAc; + const Scalar* blockB = (Scalar *) blockBc; + + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA; + Index row = 0; + +#define MAX_COMPLEX_UNROLL 3 + while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { + gemm_complex_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + } + switch( 
(rows-row)/accCols ) { +#if MAX_COMPLEX_UNROLL > 4 + case 4: + gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 3 + case 3: + gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 2 + case 2: + gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 1 + case 1: + gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } + } + + if(remaining_cols > 0) + { + const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; + const Scalar* lhs_base = blockA; + + for(; col < cols; col++) + { + Index row = 0; + + gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); + + if (remaining_rows > 0) + { + gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); + } + 
rhs_base++; + } + } +} + +#undef accColsC +#undef advanceCols +#undef advanceRows + +/************************************ + * ppc64le template specializations * + * **********************************/ +template +struct gemm_pack_lhs +{ + void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs +{ + void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +#if EIGEN_ALTIVEC_USE_CUSTOM_PACK +template +struct gemm_pack_rhs +{ + void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_rhs +{ + void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} +#endif + +template +struct gemm_pack_lhs +{ + void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index 
offset) +{ + dhs_pack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs +{ + void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +#if EIGEN_ALTIVEC_USE_CUSTOM_PACK +template +struct gemm_pack_rhs +{ + void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_rhs +{ + void operator()(float* blockB, const DataMapper& rhs, 
Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} +#endif + +template +struct gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_rhs, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, 
Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_rhs, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +// ********* gebp specializations ********* +template +struct gebp_kernel +{ + typedef typename quad_traits::vectortype Packet; + typedef typename quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const float* blockA, const float* blockB, + Index rows, Index depth, Index cols, float alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel + ::operator()(const DataMapper& res, const float* blockA, const float* blockB, + Index rows, Index depth, Index cols, float alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index 
accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index); + + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemmMMA; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemmMMA; + } + else{ + gemm_function = &Eigen::internal::gemm; + } + #else + gemm_function = &Eigen::internal::gemm; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, std::complex, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef Packet4f Packet; + typedef Packet2cf Packetc; + typedef Packet4f RhsPacket; + + void operator()(const DataMapper& res, const std::complex* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, std::complex, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const std::complex* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + 
if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef Packet4f Packet; + typedef Packet2cf Packetc; + typedef Packet4f RhsPacket; + + void operator()(const DataMapper& res, const float* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const float* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const float*, const std::complex*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif 
defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef Packet4f Packet; + typedef Packet2cf Packetc; + typedef Packet4f RhsPacket; + + void operator()(const DataMapper& res, const std::complex* blockA, const float* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const std::complex* blockA, const float* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const std::complex*, const float*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, 
false, true>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel +{ + typedef typename quad_traits::vectortype Packet; + typedef typename quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const double* blockA, const double* blockB, + Index rows, Index depth, Index cols, double alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel + ::operator()(const DataMapper& res, const double* blockA, const double* blockB, + Index rows, Index depth, Index cols, double alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index); + + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemmMMA; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemmMMA; + } + else{ + gemm_function = 
&Eigen::internal::gemm; + } + #else + gemm_function = &Eigen::internal::gemm; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, std::complex, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef quad_traits::vectortype Packet; + typedef Packet1cd Packetc; + typedef quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const std::complex* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, std::complex, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const std::complex* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, 
false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef quad_traits::vectortype Packet; + typedef Packet1cd Packetc; + typedef quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const std::complex* blockA, const double* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const std::complex* blockA, const double* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const std::complex*, const double*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, 
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef quad_traits::vectortype Packet; + typedef Packet1cd Packetc; + typedef quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const double* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const double* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const double*, const std::complex*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = 
&Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h new file mode 100644 index 00000000..33d54349 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -0,0 +1,221 @@ +//#define EIGEN_POWER_USE_PREFETCH // Use prefetching in gemm routines +#ifdef EIGEN_POWER_USE_PREFETCH +#define EIGEN_POWER_PREFETCH(p) prefetch(p) +#else +#define EIGEN_POWER_PREFETCH(p) +#endif + +namespace Eigen { + +namespace internal { + +template +EIGEN_STRONG_INLINE void gemm_extra_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index remaining_rows, + Index remaining_cols, + const Packet& pAlpha); + +template +EIGEN_STRONG_INLINE void gemm_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask); + +template +EIGEN_STRONG_INLINE void gemm_unrolled_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index rows, + Index col, + Index remaining_cols, + const Packet& pAlpha); + +template +EIGEN_ALWAYS_INLINE Packet 
bmask(const int remaining_rows); + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index remaining_rows, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag); + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask); + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index rows, + Index col, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag); + +template +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); + +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); + +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); + +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); + +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); + +const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, + 16, 17, 18, 19, + 4, 5, 6, 7, + 20, 21, 22, 23}; + +const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, + 24, 25, 26, 27, + 12, 13, 14, 15, + 28, 29, 30, 31}; +//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 +const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 
4, 5, 6, 7, + 16, 17, 18, 19, 20, 21, 22, 23}; + +//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 +const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, + 24, 25, 26, 27, 28, 29, 30, 31}; + + +// Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. +template +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); + acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); + acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); + acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); + + acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); + acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); + acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); + acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); +} + +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +{ + bcouple_common(taccReal, taccImag, acc1, acc2); + + acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); + acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); + acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); + acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); + + acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); + acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); + acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); + acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void 
bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); + + acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); +} + +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +{ + bcouple_common(taccReal, taccImag, acc1, acc2); + + acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); + + acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); +} + +template<> +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); + acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); + acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); + acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); + + acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); + acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); + acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); + acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); +} + +template<> +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); + + acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); +} + +// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. 
+template +EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs) +{ + return ploadu(rhs); +} + +} // end namespace internal +} // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h new file mode 100644 index 00000000..6540c6fa --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -0,0 +1,629 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com) +// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H +#define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H + +#pragma GCC target("cpu=power10") + +#ifdef __has_builtin +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif +#endif + +namespace Eigen { + +namespace internal { + +template +EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) +{ + __builtin_mma_xxsetaccz(acc); +} + +template +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) +{ + PacketBlock result; + __builtin_mma_disassemble_acc(&result.packet, acc); + + PacketBlock tRes; + bload(tRes, data, i, j); + + bscale(tRes, result, alpha); + + data.template storePacketBlock(i, j, tRes); +} + +template +EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) +{ + PacketBlock resultReal, resultImag; + __builtin_mma_disassemble_acc(&resultReal.packet, accReal); + 
__builtin_mma_disassemble_acc(&resultImag.packet, accImag); + + PacketBlock tRes; + bload(tRes, data, i, j); + + PacketBlock taccReal, taccImag; + bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag); + + PacketBlock acc1, acc2; + bcouple(taccReal, taccImag, tRes, acc1, acc2); + + data.template storePacketBlock(i + N*accColsC, j, acc1); + data.template storePacketBlock(i + (N+1)*accColsC, j, acc2); +} + +// Defaults to float32, since Eigen still supports C++03 we can't use default template arguments +template +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) +{ + if(NegativeAccumulate) + { + __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } else { + __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) +{ + __vector_pair* a0 = (__vector_pair *)(&a.packet[0]); + if(NegativeAccumulate) + { + __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b); + } else { + __builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) +{ + if(NegativeAccumulate) + { + __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b); + } else { + __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) +{ + // Just for compilation +} + +template +EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) +{ + pgerMMA(accReal, rhsV, lhsV); + if(LhsIsReal) { + pgerMMA(accImag, rhsVi, lhsV); + } else { + if(!RhsIsReal) { + pgerMMA(accReal, rhsVi, lhsVi); + pgerMMA(accImag, 
rhsVi, lhsV); + } else { + EIGEN_UNUSED_VARIABLE(rhsVi); + } + pgerMMA(accImag, rhsV, lhsVi); + } +} + +// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. +template +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) +{ + rhsV = ploadRhs((const Scalar*)(rhs)); +} + +template<> +EIGEN_ALWAYS_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) +{ + rhsV.packet[0] = ploadRhs((const double *)((Packet2d *)rhs )); + rhsV.packet[1] = ploadRhs((const double *)(((Packet2d *)rhs) + 1)); +} + +template<> +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) +{ +#if EIGEN_COMP_LLVM + __builtin_vsx_assemble_pair(&rhsV, + (__vector unsigned char)(ploadRhs((const double *)(((Packet2d *)rhs) + 1))), + (__vector unsigned char)(ploadRhs((const double *)((Packet2d *)rhs )))); +#else + __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs)); +#endif +} + +template<> +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) +{ + // Just for compilation +} + +// PEEL_MMA loop factor. 
+#define PEEL_MMA 7 + +#define MICRO_MMA_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_MMA_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr##iter); \ + lhs_ptr##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + } + +#define MICRO_MMA_WORK_ONE(iter, type, peel) \ + if (unroll_factor > iter) { \ + pgerMMA(&accZero##iter, rhsV##peel, lhsV##iter); \ + } + +#define MICRO_MMA_TYPE_PEEL(func, func2, type, peel) \ + if (PEEL_MMA > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ + ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV##peel); \ + MICRO_MMA_UNROLL(func2); \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ + func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + +#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ + type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ + MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9); + +#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \ + type rhsV0; \ + MICRO_MMA_TYPE_PEEL(func,func2,type,0); + +#define MICRO_MMA_ONE_PEEL \ + if (sizeof(Scalar) == sizeof(float)) { \ + MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \ + } else { \ + MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \ + } \ + rhs_ptr += (accRows * PEEL_MMA); + +#define MICRO_MMA_ONE \ + if (sizeof(Scalar) == sizeof(float)) { \ + MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, 
MICRO_MMA_LOAD_ONE, RhsPacket); \ + } else { \ + MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \ + } \ + rhs_ptr += accRows; + +#define MICRO_MMA_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + bsetzeroMMA(&accZero##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##iter); \ + } + +#define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE) + +#define MICRO_MMA_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ + } + +#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE) + +#define MICRO_MMA_PREFETCH_ONE(iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr##iter); \ + } + +#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE) + +#define MICRO_MMA_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + storeAccumulator(row + iter*accCols, col, res, pAlpha, &accZero##iter); \ + } + +#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) + +template +EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index col, + const Packet& pAlpha) +{ + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; + __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + + MICRO_MMA_SRC_PTR + MICRO_MMA_DST_PTR + + Index k = 0; + for(; k + PEEL_MMA <= depth; k+= PEEL_MMA) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + MICRO_MMA_PREFETCH + MICRO_MMA_ONE_PEEL + } + for(; k < depth; k++) + { + MICRO_MMA_ONE + } + MICRO_MMA_STORE + + row += unroll_factor*accCols; +} + +template +void gemmMMA(const DataMapper& res, const Scalar* blockA, 
const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; + const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + + const Packet pAlpha = pset1(alpha); + const Packet pMask = bmask((const int)(remaining_rows)); + + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA; + + Index row = 0; +#define MAX_MMA_UNROLL 7 + while(row + MAX_MMA_UNROLL*accCols <= rows) { + gemm_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + } + switch( (rows-row)/accCols ) { +#if MAX_MMA_UNROLL > 7 + case 7: + gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 6 + case 6: + gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 5 + case 5: + gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 4 + case 4: + gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 3 + case 3: + gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 2 + case 2: + gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, 
DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 1 + case 1: + gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif + default: + break; + } +#undef MAX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } + } + + if(remaining_cols > 0) + { + const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; + const Scalar* lhs_base = blockA; + + for(; col < cols; col++) + { + Index row = 0; + + gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); + + if (remaining_rows > 0) + { + gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); + } + rhs_base++; + } + } +} + +#define accColsC (accCols / 2) +#define advanceRows ((LhsIsReal) ? 1 : 2) +#define advanceCols ((RhsIsReal) ? 1 : 2) + +// PEEL_COMPLEX_MMA loop factor. 
+#define PEEL_COMPLEX_MMA 7 + +#define MICRO_COMPLEX_MMA_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) + +#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ + lhs_ptr_real##iter += accCols; \ + if(!LhsIsReal) { \ + lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ + lhs_ptr_imag##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } + +#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \ + if (unroll_factor > iter) { \ + pgercMMA(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + } + +#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \ + if (PEEL_COMPLEX_MMA > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ + if(!RhsIsReal) { \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } \ + MICRO_COMPLEX_MMA_UNROLL(func2); \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ + type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ + type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \ + 
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9); + +#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \ + type rhsV0, rhsVi0; \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); + +#define MICRO_COMPLEX_MMA_ONE_PEEL \ + if (sizeof(Scalar) == sizeof(float)) { \ + MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \ + } else { \ + MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \ + } \ + rhs_ptr_real += (accRows * PEEL_COMPLEX_MMA); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_MMA); + +#define MICRO_COMPLEX_MMA_ONE \ + if (sizeof(Scalar) == sizeof(float)) { \ + MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \ + } else { \ + MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \ + } \ + rhs_ptr_real += accRows; \ + if(!RhsIsReal) rhs_ptr_imag += accRows; + +#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + bsetzeroMMA(&accReal##iter); \ + bsetzeroMMA(&accImag##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##iter); \ + EIGEN_UNUSED_VARIABLE(accImag##iter); \ + } + +#define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE) + +#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ + if(!LhsIsReal) { \ + lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } + +#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE) + +#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) 
\ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ + if(!LhsIsReal) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ + } \ + } + +#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE) + +#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + storeComplexAccumulator(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ + } + +#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index col, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) { + rhs_ptr_imag = rhs_base + accRows*strideB; + } else { + EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; + __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; + + MICRO_COMPLEX_MMA_SRC_PTR + MICRO_COMPLEX_MMA_DST_PTR + + Index k = 0; + for(; k + PEEL_COMPLEX_MMA <= depth; k+= PEEL_COMPLEX_MMA) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + MICRO_COMPLEX_MMA_PREFETCH + MICRO_COMPLEX_MMA_ONE_PEEL + } + for(; k < depth; k++) + { + MICRO_COMPLEX_MMA_ONE + } + MICRO_COMPLEX_MMA_STORE + + row += unroll_factor*accCols; +} + +template +void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, 
Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; + const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + + const Packet pAlphaReal = pset1(alpha.real()); + const Packet pAlphaImag = pset1(alpha.imag()); + const Packet pMask = bmask((const int)(remaining_rows)); + + const Scalar* blockA = (Scalar *) blockAc; + const Scalar* blockB = (Scalar *) blockBc; + + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA; + Index row = 0; + +#define MAX_COMPLEX_MMA_UNROLL 4 + while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { + gemm_complex_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_MMA_UNROLL > 4 + case 4: + gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 3 + case 3: + gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 2 + case 2: + gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 1 + case 1: + gemm_complex_unrolled_MMA_iteration<1, 
Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } + } + + if(remaining_cols > 0) + { + const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; + const Scalar* lhs_base = blockA; + + for(; col < cols; col++) + { + Index row = 0; + + gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); + + if (remaining_rows > 0) + { + gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); + } + rhs_base++; + } + } +} + +#undef accColsC +#undef advanceRows +#undef advanceCols + +#pragma GCC reset_options +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H + diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h b/inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h index 08a27d15..2a440545 100755 --- a/inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,31 +22,38 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -typedef __vector float Packet4f; -typedef __vector int Packet4i; -typedef __vector unsigned int Packet4ui; -typedef 
__vector __bool int Packet4bi; -typedef __vector short int Packet8i; -typedef __vector unsigned char Packet16uc; +typedef __vector float Packet4f; +typedef __vector int Packet4i; +typedef __vector unsigned int Packet4ui; +typedef __vector __bool int Packet4bi; +typedef __vector short int Packet8s; +typedef __vector unsigned short int Packet8us; +typedef __vector signed char Packet16c; +typedef __vector unsigned char Packet16uc; +typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf; // We don't want to write the same code all the time, but we need to reuse the constants // and it doesn't really work to declare them global, so we define macros instead - #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + Packet4f p4f_##NAME = {X, X, X, X} #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = vec_splat_s32(X) +#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \ + Packet4ui p4ui_##NAME = {X, X, X, X} + +#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \ + Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X} + +#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \ + Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X} + #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = pset1(X) @@ -64,7 +71,7 @@ typedef __vector unsigned char Packet16uc; #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) - +#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits::type // These constants are endian-agnostic static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} @@ -72,25 +79,36 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} 
+static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u); +static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); +static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} +static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} #ifndef __VSX__ static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} #endif -static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; -static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; +static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; + +static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; +static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; -static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; +static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 }; +static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; -// Mask alignment -#ifdef __PPC64__ -#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 -#else -#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 -#endif +static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; +static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 }; +static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 }; +static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 }; +static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 }; -#define _EIGEN_ALIGNED_PTR(x) 
((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) +static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 }; // Handle endianness properly while loading constants // Define global static constants: @@ -129,27 +147,27 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_L #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #endif -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4, + size = 4, HasHalfPacket = 1, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasMin = 1, - HasMax = 1, - HasAbs = 1, - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, #ifdef __VSX__ HasSqrt = 1, #if !EIGEN_COMP_CLANG @@ -160,16 +178,62 @@ template<> struct packet_traits : default_packet_traits #else HasSqrt = 0, HasRsqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, #endif HasRound = 1, HasFloor = 1, HasCeil = 1, + HasRint = 1, HasNegate = 1, HasBlend = 1 }; }; -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { + typedef Packet8bf type; + typedef Packet8bf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, +#ifdef __VSX__ + HasSqrt = 1, +#if !EIGEN_COMP_CLANG + HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif +#else + HasSqrt = 0, + HasRsqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, +#endif + HasRound = 1, + HasFloor = 
1, + HasCeil = 1, + HasRint = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { typedef Packet4i type; typedef Packet4i half; enum { @@ -178,6 +242,79 @@ template<> struct packet_traits : default_packet_traits size = 4, HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8s type; + typedef Packet8s half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8us type; + typedef Packet8us half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16c type; + typedef Packet16c half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16uc type; + typedef Packet16uc half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 1, HasSub = 1, HasMul = 1, @@ -186,9 +323,62 @@ template<> struct packet_traits : default_packet_traits }; }; +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits +{ + typedef int type; + typedef Packet4i half; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 
+}; +template<> struct unpacket_traits +{ + typedef short int type; + typedef Packet8s half; + enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits +{ + typedef unsigned short int type; + typedef Packet8us half; + enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; + +template<> struct unpacket_traits +{ + typedef signed char type; + typedef Packet16c half; + enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits +{ + typedef unsigned char type; + typedef Packet16uc half; + enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits +{ + typedef bfloat16 type; + typedef Packet8bf half; + enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +inline std::ostream & operator <<(std::ostream & s, const Packet16c & v) +{ + union { + Packet16c v; + signed char n[16]; + } vt; + vt.v = v; + for (int i=0; i< 16; i++) + s << vt.n[i] << ", "; + return s; +} inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) { @@ -198,7 +388,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) } vt; vt.v = v; for (int i=0; i< 16; i++) - s << (int)vt.n[i] << ", "; + s << vt.n[i] << ", "; return s; } @@ -235,148 +425,397 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) return s; } -// Need to define them first or we get specialization after instantiation errors -template<> 
EIGEN_STRONG_INLINE Packet4f pload(const float* from) +template +EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) { + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD #ifdef __VSX__ - return vec_vsx_ld(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else return vec_ld(0, from); #endif } +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + return pload_common(from); +} + template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { - EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif + return pload_common(from); } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +template<> EIGEN_STRONG_INLINE Packet8s pload(const short int* from) +{ + return pload_common(from); +} + +template<> EIGEN_STRONG_INLINE Packet8us pload(const unsigned short int* from) +{ + return pload_common(from); +} + +template<> EIGEN_STRONG_INLINE Packet16c pload(const signed char* from) +{ + return pload_common(from); +} + +template<> EIGEN_STRONG_INLINE Packet16uc pload(const unsigned char* from) +{ + return pload_common(from); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pload(const bfloat16* from) { + return pload_common(reinterpret_cast(from)); +} + +template +EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. 
+ EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE #ifdef __VSX__ - vec_vsx_st(from, 0, to); + vec_xst(from, 0, to); #else vec_st(from, 0, to); #endif } +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + pstore_common(to, from); +} + template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { - EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + pstore_common(to, from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - Packet4f v = {from, from, from, from}; +template<> EIGEN_STRONG_INLINE void pstore(short int* to, const Packet8s& from) +{ + pstore_common(to, from); +} + +template<> EIGEN_STRONG_INLINE void pstore(unsigned short int* to, const Packet8us& from) +{ + pstore_common(to, from); +} + +template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet8bf& from) +{ + pstore_common(reinterpret_cast(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstore(signed char* to, const Packet16c& from) +{ + pstore_common(to, from); +} + +template<> EIGEN_STRONG_INLINE void pstore(unsigned char* to, const Packet16uc& from) +{ + pstore_common(to, from); +} + +template +EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from}; return v; } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { - Packet4i v = {from, from, from, from}; +template +EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from, from, from, from, from}; return v; } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) + +template +EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; + return v; +} + +template<> EIGEN_STRONG_INLINE 
Packet4f pset1(const float& from) { + return pset1_size4(from); +} + +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { + return pset1_size4(from); +} + +template<> EIGEN_STRONG_INLINE Packet8s pset1(const short int& from) { + return pset1_size8(from); +} + +template<> EIGEN_STRONG_INLINE Packet8us pset1(const unsigned short int& from) { + return pset1_size8(from); +} + +template<> EIGEN_STRONG_INLINE Packet16c pset1(const signed char& from) { + return pset1_size16(from); +} + +template<> EIGEN_STRONG_INLINE Packet16uc pset1(const unsigned char& from) { + return pset1_size16(from); +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { + return reinterpret_cast(pset1(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pset1(const bfloat16& from) { + return pset1_size8(reinterpret_cast(from)); +} + +template EIGEN_STRONG_INLINE void +pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - a3 = pload(a); + a3 = pload(a); a0 = vec_splat(a3, 0); a1 = vec_splat(a3, 1); a2 = vec_splat(a3, 2); a3 = vec_splat(a3, 3); } + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + pbroadcast4_common(a, a0, a1, a2, a3); +} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) { - a3 = pload(a); - a0 = vec_splat(a3, 0); - a1 = vec_splat(a3, 1); - a2 = vec_splat(a3, 2); - a3 = vec_splat(a3, 3); + pbroadcast4_common(a, a0, a1, a2, a3); +} + +template EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + return pload(a); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; - af[0] = from[0*stride]; 
- af[1] = from[1*stride]; - af[2] = from[2*stride]; - af[3] = from[3*stride]; - return pload(af); + return pgather_common(from, stride); } + template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) + +template EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride) { - float EIGEN_ALIGN16 af[4]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; - to[2*stride] = af[2]; - to[3*stride] = af[3]; + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + a[4] = from[4*stride]; + a[5] = from[5*stride]; + a[6] = from[6*stride]; + a[7] = from[7*stride]; + return pload(a); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) + +template<> EIGEN_DEVICE_FUNC inline Packet8s pgather(const short int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; - pstore((int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; + return pgather_size8(from, stride); } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return pset1(a) + p4i_COUNTDOWN; } - -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return a + b; } +template<> EIGEN_DEVICE_FUNC inline Packet8us pgather(const unsigned short int* from, Index stride) +{ + return pgather_size8(from, stride); +} -template<> 
EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return a - b; } +template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather(const bfloat16* from, Index stride) +{ + return pgather_size8(from, stride); +} -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } +template EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + a[4] = from[4*stride]; + a[5] = from[5*stride]; + a[6] = from[6*stride]; + a[7] = from[7*stride]; + a[8] = from[8*stride]; + a[9] = from[9*stride]; + a[10] = from[10*stride]; + a[11] = from[11*stride]; + a[12] = from[12*stride]; + a[13] = from[13*stride]; + a[14] = from[14*stride]; + a[15] = from[15*stride]; + return pload(a); +} -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return a * b; } +template<> EIGEN_DEVICE_FUNC inline Packet16c pgather(const signed char* from, Index stride) +{ + return pgather_size16(from, stride); +} -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather(const unsigned char* from, Index stride) { -#ifndef __VSX__ // VSX actually provides a div instruction - Packet4f t, y_0, y_1; + return pgather_size16(from, stride); +} - // Altivec does not offer a divide instruction, we have to do 
a reciprocal approximation - y_0 = vec_re(b); +template EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; +} - // Do one Newton-Raphson iteration to get the needed accuracy - t = vec_nmsub(y_0, b, p4f_ONE); - y_1 = vec_madd(y_0, t, y_0); +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + pscatter_size4(to, from, stride); +} - return vec_madd(a, y_1, p4f_MZERO); -#else - return vec_div(a, b); -#endif +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +{ + pscatter_size4(to, from, stride); +} + +template EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; + to[4*stride] = a[4]; + to[5*stride] = a[5]; + to[6*stride] = a[6]; + to[7*stride] = a[7]; +} + + +template<> EIGEN_DEVICE_FUNC inline void pscatter(short int* to, const Packet8s& from, Index stride) +{ + pscatter_size8(to, from, stride); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned short int* to, const Packet8us& from, Index stride) +{ + pscatter_size8(to, from, stride); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(bfloat16* to, const Packet8bf& from, Index stride) +{ + pscatter_size8(to, from, stride); +} + +template EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; + 
to[4*stride] = a[4]; + to[5*stride] = a[5]; + to[6*stride] = a[6]; + to[7*stride] = a[7]; + to[8*stride] = a[8]; + to[9*stride] = a[9]; + to[10*stride] = a[10]; + to[11*stride] = a[11]; + to[12*stride] = a[12]; + to[13*stride] = a[13]; + to[14*stride] = a[14]; + to[15*stride] = a[15]; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(signed char* to, const Packet16c& from, Index stride) +{ + pscatter_size16(to, from, stride); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned char* to, const Packet16uc& from, Index stride) +{ + pscatter_size16(to, from, stride); +} + +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return pset1(a) + p4i_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet8s plset(const short int& a) { return pset1(a) + p8s_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet8us plset(const unsigned short int& a) { return pset1(a) + p8us_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet16c plset(const signed char& a) { return pset1(a) + p16c_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet16uc plset(const unsigned char& a) { return pset1(a) + p16uc_COUNTDOWN; } + +template<> EIGEN_STRONG_INLINE Packet4f padd (const Packet4f& a, const Packet4f& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet4i padd (const Packet4i& a, const Packet4i& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet4ui padd (const Packet4ui& a, const Packet4ui& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet8s padd (const Packet8s& a, const Packet8s& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet8us padd (const Packet8us& a, const Packet8us& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet16c padd (const Packet16c& a, const Packet16c& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet16uc padd(const Packet16uc& a, const Packet16uc& b) { return a + b; } + +template<> 
EIGEN_STRONG_INLINE Packet4f psub (const Packet4f& a, const Packet4f& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet4i psub (const Packet4i& a, const Packet4i& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet8s psub (const Packet8s& a, const Packet8s& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet8us psub (const Packet8us& a, const Packet8us& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet16c psub (const Packet16c& a, const Packet16c& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return a - b; } + +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } + +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4f pmul (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } +template<> EIGEN_STRONG_INLINE Packet4i pmul (const Packet4i& a, const Packet4i& b) { return a * b; } +template<> EIGEN_STRONG_INLINE Packet8s pmul (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmul (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmul (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); } + + +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ +#ifndef __VSX__ // VSX actually provides a div instruction + Packet4f t, y_0, y_1; + + // Altivec does not offer a divide instruction, we have to do a reciprocal approximation + y_0 = vec_re(b); + + // Do one Newton-Raphson iteration to get the needed accuracy + 
t = vec_nmsub(y_0, b, p4f_ONE); + y_1 = vec_madd(y_0, t, y_0); + + return vec_madd(a, y_1, p4f_MZERO); +#else + return vec_div(a, b); +#endif } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) @@ -387,10 +826,13 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } +template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { #ifdef __VSX__ + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN Packet4f ret; __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); return ret; @@ -399,10 +841,16 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet8s pmin(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet8us pmin(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet16c pmin(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); } + template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { #ifdef 
__VSX__ + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN Packet4f ret; __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); return ret; @@ -411,79 +859,214 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet8s pmax(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet8us pmax(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { + Packet4f c = reinterpret_cast(vec_cmpge(a,b)); + return vec_nor(c,c); +} + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> 
EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4ui pand(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet8us pand(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a, const 
Packet8bf& b) { + return pand(a, b); +} + template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet8s por(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet8us por(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a, const Packet8bf& b) { + return por(a, b); +} template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, const Packet8bf& b) { + return pxor(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { + return vec_sel(b, a, reinterpret_cast(mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + Packet4f t = vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p4ui_SIGN), p4ui_PREV0DOT5)), a); + Packet4f res; + +#ifdef __VSX__ + __asm__("xvrspiz %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (t)); +#else + __asm__("vrfiz %0, %1\n\t" + : "=v" (res) + : "v" (t)); 
+#endif + + return res; +} template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) +{ + Packet4f res; -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) + __asm__("xvrspic %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (a)); + + return res; +} + +template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from) { EIGEN_DEBUG_ALIGNED_LOAD +#ifdef _BIG_ENDIAN Packet16uc MSQ, LSQ; Packet16uc mask; MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword mask = vec_lvsl(0, from); // create the permute mask - return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data + //TODO: Add static_cast here + return (Packet) vec_perm(MSQ, LSQ, mask); // align the data +#else + EIGEN_DEBUG_UNALIGNED_LOAD + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ + return ploadu_common(from); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data + return ploadu_common(from); } -#else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet8s ploadu(const short int* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4i) 
vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); + return ploadu_common(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet8us ploadu(const unsigned short int* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); + return ploadu_common(from); +} +template<> EIGEN_STRONG_INLINE Packet8bf ploadu(const bfloat16* from) +{ + return ploadu_common(reinterpret_cast(from)); +} +template<> EIGEN_STRONG_INLINE Packet16c ploadu(const signed char* from) +{ + return ploadu_common(from); +} +template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const unsigned char* from) +{ + return ploadu_common(from); } -#endif -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +template EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) { - Packet4f p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); + Packet p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); return vec_perm(p, p, p16uc_DUPLICATE32_HI); } +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + return ploaddup_common(from); +} template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { - Packet4i p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); + return ploaddup_common(from); } -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const short int* from) +{ + Packet8s p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE16_HI); +} + +template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const unsigned short int* from) +{ + Packet8us p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = 
ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE16_HI); +} + +template<> EIGEN_STRONG_INLINE Packet8s ploadquad(const short int* from) +{ + Packet8s p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); +} + +template<> EIGEN_STRONG_INLINE Packet8us ploadquad(const unsigned short int* from) +{ + Packet8us p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ploadquad(const bfloat16* from) +{ + return ploadquad(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet16c ploaddup(const signed char* from) +{ + Packet16c p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE8_HI); +} + +template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const unsigned char* from) +{ + Packet16uc p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE8_HI); +} + +template EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from) { EIGEN_DEBUG_UNALIGNED_STORE +#ifdef _BIG_ENDIAN // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Warning: not thread safe! 
Packet16uc MSQ, LSQ, edges; @@ -497,45 +1080,69 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& f MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part + vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second +#else + vec_xst(from, 0, to); +#endif +} +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ + pstoreu_common(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part + pstoreu_common(to, from); } -#else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) +template<> EIGEN_STRONG_INLINE void pstoreu(short int* to, const Packet8s& from) { - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); + pstoreu_common(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) 
+template<> EIGEN_STRONG_INLINE void pstoreu(unsigned short int* to, const Packet8us& from) { - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + pstoreu_common(to, from); +} +template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet8bf& from) +{ + pstoreu_common(reinterpret_cast(to), from); +} +template<> EIGEN_STRONG_INLINE void pstoreu(signed char* to, const Packet16c& from) +{ + pstoreu_common(to, from); +} +template<> EIGEN_STRONG_INLINE void pstoreu(unsigned char* to, const Packet16uc& from) +{ + pstoreu_common(to, from); } -#endif template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } + +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) { + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x; + vec_ste(a, 0, &x); + return x; +} + +template<> EIGEN_STRONG_INLINE short int pfirst(const Packet8s& a) { + return pfirst_common(a); +} + +template<> EIGEN_STRONG_INLINE unsigned short int pfirst(const Packet8us& a) { + return pfirst_common(a); +} + +template<> EIGEN_STRONG_INLINE signed char pfirst(const Packet16c& a) +{ + return pfirst_common(a); +} + +template<> EIGEN_STRONG_INLINE unsigned char pfirst(const Packet16uc& a) +{ + return pfirst_common(a); +} template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -543,10 +1150,296 @@ template<> EIGEN_STRONG_INLINE Packet4f 
preverse(const Packet4f& a) } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); } + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} +template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE16)); +} +template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE16)); +} +template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) +{ + return vec_perm(a, a, p16uc_REVERSE8); +} +template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) +{ + return vec_perm(a, a, p16uc_REVERSE8); +} +template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) +{ + return preverse(a); +} template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) { + _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF); + return pand(p8us_abs_mask, a); +} + +template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) +{ return vec_sra(a,reinterpret_cast(pset1(N))); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) +{ return vec_sr(a,reinterpret_cast(pset1(N))); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) +{ 
return vec_sl(a,reinterpret_cast(pset1(N))); } +template EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + Packet4ui r = vec_sl(reinterpret_cast(a), p4ui_mask); + return reinterpret_cast(r); +} + +template EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + Packet4ui r = vec_sr(reinterpret_cast(a), p4ui_mask); + return reinterpret_cast(r); +} + +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + return vec_sr(a, p4ui_mask); +} + +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + return vec_sl(a, p4ui_mask); +} + +template EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); + return vec_sl(a, p8us_mask); +} +template EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); + return vec_sr(a, p8us_mask); +} + +EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){ + return plogical_shift_left<16>(reinterpret_cast(bf.m_val)); +} + +EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); + return pand( + reinterpret_cast(bf.m_val), + reinterpret_cast(p4ui_high_mask) + ); +} + +// Simple interleaving of bool masks, prevents true values from being +// converted to NaNs. 
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) { + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); + Packet4f bf_odd, bf_even; + bf_odd = pand(reinterpret_cast(p4ui_high_mask), odd); + bf_even = plogical_shift_right<16>(even); + return reinterpret_cast(por(bf_even, bf_odd)); +} + +EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ + Packet4ui input = reinterpret_cast(p4f); + Packet4ui lsb = plogical_shift_right<16>(input); + lsb = pand(lsb, reinterpret_cast(p4i_ONE)); + + _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu); + Packet4ui rounding_bias = padd(lsb, p4ui_BIAS); + input = padd(input, rounding_bias); + + //Test NaN and Subnormal - Begin + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000); + Packet4ui exp = pand(p4ui_exp_mask, reinterpret_cast(p4f)); + + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF); + Packet4ui mantissa = pand(p4ui_mantissa_mask, reinterpret_cast(p4f)); + + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000); + Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp); + Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast(p4i_ZERO)); + + Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast(p4i_ZERO)); + Packet4ui nan_selector = pandnot( + reinterpret_cast(is_max_exp), + reinterpret_cast(is_mant_zero) + ); + + Packet4ui subnormal_selector = pandnot( + reinterpret_cast(is_zero_exp), + reinterpret_cast(is_mant_zero) + ); + + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); + input = vec_sel(input, p4ui_nan, nan_selector); + input = vec_sel(input, reinterpret_cast(p4f), subnormal_selector); + //Test NaN and Subnormal - End + + input = plogical_shift_right<16>(input); + return reinterpret_cast(input); +} + +EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){ + Packet4f bf_odd, bf_even; + bf_odd = reinterpret_cast(F32ToBf16(odd).m_val); + bf_odd = plogical_shift_left<16>(bf_odd); + bf_even = 
reinterpret_cast(F32ToBf16(even).m_val); + return reinterpret_cast(por(bf_even, bf_odd)); +} +#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \ + Packet4f a_even = Bf16ToF32Even(A);\ + Packet4f a_odd = Bf16ToF32Odd(A);\ + Packet4f op_even = OP(a_even);\ + Packet4f op_odd = OP(a_odd);\ + return F32ToBf16(op_even, op_odd);\ + +#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \ + Packet4f a_even = Bf16ToF32Even(A);\ + Packet4f a_odd = Bf16ToF32Odd(A);\ + Packet4f b_even = Bf16ToF32Even(B);\ + Packet4f b_odd = Bf16ToF32Odd(B);\ + Packet4f op_even = OP(a_even, b_even);\ + Packet4f op_odd = OP(a_odd, b_odd);\ + return F32ToBf16(op_even, op_odd);\ + +#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \ + Packet4f a_even = Bf16ToF32Even(A);\ + Packet4f a_odd = Bf16ToF32Odd(A);\ + Packet4f b_even = Bf16ToF32Even(B);\ + Packet4f b_odd = Bf16ToF32Odd(B);\ + Packet4f op_even = OP(a_even, b_even);\ + Packet4f op_odd = OP(a_odd, b_odd);\ + return F32ToBf16Bool(op_even, op_odd);\ + +template<> EIGEN_STRONG_INLINE Packet8bf padd(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(padd, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmul(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(pmul, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pdiv(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(pdiv, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) { + BF16_TO_F32_UNARY_OP_WRAPPER(pnegate, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(psub, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ + 
BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_generic(a,exponent); +} +template<> EIGEN_STRONG_INLINE Packet8bf pldexp (const Packet8bf& a, const Packet8bf& exponent){ + BF16_TO_F32_BINARY_OP_WRAPPER(pldexp, a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_generic(a,exponent); +} +template<> EIGEN_STRONG_INLINE Packet8bf pfrexp (const Packet8bf& a, Packet8bf& e){ + Packet4f a_even = Bf16ToF32Even(a); + Packet4f a_odd = Bf16ToF32Odd(a); + Packet4f e_even; + Packet4f e_odd; + Packet4f op_even = pfrexp(a_even, e_even); + Packet4f op_odd = pfrexp(a_odd, e_odd); + e = F32ToBf16(e_even, e_odd); + return F32ToBf16(op_even, op_odd); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psin (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcos (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf plog (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pfloor (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pfloor, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pceil (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pceil, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pround (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pround, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf print (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(print, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { + Packet4f a_even = Bf16ToF32Even(a); + Packet4f a_odd = Bf16ToF32Odd(a); + Packet4f b_even = Bf16ToF32Even(b); + Packet4f b_odd = Bf16ToF32Odd(b); + Packet4f c_even = Bf16ToF32Even(c); + Packet4f c_odd = Bf16ToF32Odd(c); + Packet4f 
pmadd_even = pmadd(a_even, b_even, c_even); + Packet4f pmadd_odd = pmadd(a_odd, b_odd, c_odd); + return F32ToBf16(pmadd_even, pmadd_odd); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmin(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(pmin, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmax(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(pmax, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt, a, b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan, a, b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le, a, b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq, a, b); +} + +template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) { + return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ploaddup(const bfloat16* from) +{ + return ploaddup(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8bf plset(const bfloat16& a) { + bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3), + bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) }; + return padd(pset1(a), pload(countdown)); +} template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { @@ -558,34 +1451,6 @@ template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) return pfirst(sum); } -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - Packet4f v[4], sum[4]; - - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first 
set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); - - // Now do the summation: - // Lines 0+1 - sum[0] = sum[0] + sum[1]; - // Lines 2+3 - sum[1] = sum[2] + sum[3]; - // Add the results - sum[0] = sum[0] + sum[1]; - - return sum[0]; -} - template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i sum; @@ -598,141 +1463,377 @@ template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) return pfirst(sum); } -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) +{ + float redux_even = predux(Bf16ToF32Even(a)); + float redux_odd = predux(Bf16ToF32Odd(a)); + float f32_result = redux_even + redux_odd; + return bfloat16(f32_result); +} +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) +{ + union{ + Packet v; + __UNPACK_TYPE__(Packet) n[8]; + } vt; + vt.v = a; + + EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; + EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; + Packet4i first_half = pload(first_loader); + Packet4i second_half = pload(second_loader); + + return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half)); +} + +template<> EIGEN_STRONG_INLINE short int predux(const Packet8s& a) +{ + return predux_size8(a); +} + +template<> EIGEN_STRONG_INLINE unsigned short int predux(const Packet8us& a) +{ + return predux_size8(a); +} + +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) +{ + union{ + Packet v; + __UNPACK_TYPE__(Packet) n[16]; + } vt; + vt.v = a; + + EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; + 
EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; + EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] }; + EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] }; + + Packet4i first_quarter = pload(first_loader); + Packet4i second_quarter = pload(second_loader); + Packet4i third_quarter = pload(third_loader); + Packet4i fourth_quarter = pload(fourth_loader); + + return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + + predux(third_quarter) + predux(fourth_quarter)); +} + +template<> EIGEN_STRONG_INLINE signed char predux(const Packet16c& a) +{ + return predux_size16(a); +} + +template<> EIGEN_STRONG_INLINE unsigned char predux(const Packet16uc& a) +{ + return predux_size16(a); +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + Packet4f prod; + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); +} + +template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +{ + EIGEN_ALIGN16 int aux[4]; + pstore(aux, a); + return aux[0] * aux[1] * aux[2] * aux[3]; +} + +template<> EIGEN_STRONG_INLINE short int predux_mul(const Packet8s& a) +{ + Packet8s pair, quad, octo; + + pair = vec_mul(a, vec_sld(a, a, 8)); + quad = vec_mul(pair, vec_sld(pair, pair, 4)); + octo = vec_mul(quad, vec_sld(quad, quad, 2)); + + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE unsigned short int predux_mul(const Packet8us& a) +{ + Packet8us pair, quad, octo; + + pair = vec_mul(a, vec_sld(a, a, 8)); + quad = vec_mul(pair, vec_sld(pair, pair, 4)); + octo = vec_mul(quad, vec_sld(quad, quad, 2)); + + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) +{ + float redux_even = predux_mul(Bf16ToF32Even(a)); + float redux_odd = predux_mul(Bf16ToF32Odd(a)); + float f32_result = redux_even * redux_odd; + return bfloat16(f32_result); +} + 
+ +template<> EIGEN_STRONG_INLINE signed char predux_mul(const Packet16c& a) +{ + Packet16c pair, quad, octo, result; + + pair = vec_mul(a, vec_sld(a, a, 8)); + quad = vec_mul(pair, vec_sld(pair, pair, 4)); + octo = vec_mul(quad, vec_sld(quad, quad, 2)); + result = vec_mul(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); +} + +template<> EIGEN_STRONG_INLINE unsigned char predux_mul(const Packet16uc& a) +{ + Packet16uc pair, quad, octo, result; + + pair = vec_mul(a, vec_sld(a, a, 8)); + quad = vec_mul(pair, vec_sld(pair, pair, 4)); + octo = vec_mul(quad, vec_sld(quad, quad, 2)); + result = vec_mul(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); +} + +// min +template EIGEN_STRONG_INLINE +__UNPACK_TYPE__(Packet) predux_min4(const Packet& a) +{ + Packet b, res; + b = vec_min(a, vec_sld(a, a, 8)); + res = vec_min(b, vec_sld(b, b, 4)); + return pfirst(res); +} + + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + return predux_min4(a); +} + +template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +{ + return predux_min4(a); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) +{ + float redux_even = predux_min(Bf16ToF32Even(a)); + float redux_odd = predux_min(Bf16ToF32Odd(a)); + float f32_result = (std::min)(redux_even, redux_odd); + return bfloat16(f32_result); +} + +template<> EIGEN_STRONG_INLINE short int predux_min(const Packet8s& a) +{ + Packet8s pair, quad, octo; + + //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } + pair = vec_min(a, vec_sld(a, a, 8)); + + //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } + quad = vec_min(pair, vec_sld(pair, pair, 4)); + + //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } + octo = vec_min(quad, vec_sld(quad, quad, 2)); + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE unsigned short int predux_min(const Packet8us& a) +{ + Packet8us pair, quad, octo; + + //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } + pair = vec_min(a, 
vec_sld(a, a, 8)); + + //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } + quad = vec_min(pair, vec_sld(pair, pair, 4)); + + //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } + octo = vec_min(quad, vec_sld(quad, quad, 2)); + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE signed char predux_min(const Packet16c& a) +{ + Packet16c pair, quad, octo, result; + + pair = vec_min(a, vec_sld(a, a, 8)); + quad = vec_min(pair, vec_sld(pair, pair, 4)); + octo = vec_min(quad, vec_sld(quad, quad, 2)); + result = vec_min(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); +} + +template<> EIGEN_STRONG_INLINE unsigned char predux_min(const Packet16uc& a) +{ + Packet16uc pair, quad, octo, result; + + pair = vec_min(a, vec_sld(a, a, 8)); + quad = vec_min(pair, vec_sld(pair, pair, 4)); + octo = vec_min(quad, vec_sld(quad, quad, 2)); + result = vec_min(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); +} +// max +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) +{ + Packet b, res; + b = vec_max(a, vec_sld(a, a, 8)); + res = vec_max(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + return predux_max4(a); +} + +template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +{ + return predux_max4(a); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) +{ + float redux_even = predux_max(Bf16ToF32Even(a)); + float redux_odd = predux_max(Bf16ToF32Odd(a)); + float f32_result = (std::max)(redux_even, redux_odd); + return bfloat16(f32_result); +} + +template<> EIGEN_STRONG_INLINE short int predux_max(const Packet8s& a) +{ + Packet8s pair, quad, octo; + + //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } + pair = vec_max(a, vec_sld(a, a, 8)); + + //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } + quad = vec_max(pair, vec_sld(pair, pair, 4)); + + //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } + octo = vec_max(quad, 
vec_sld(quad, quad, 2)); + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE unsigned short int predux_max(const Packet8us& a) { - Packet4i v[4], sum[4]; + Packet8us pair, quad, octo; + + //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } + pair = vec_max(a, vec_sld(a, a, 8)); - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); + //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } + quad = vec_max(pair, vec_sld(pair, pair, 4)); - // Now do the summation: - // Lines 0+1 - sum[0] = sum[0] + sum[1]; - // Lines 2+3 - sum[1] = sum[2] + sum[3]; - // Add the results - sum[0] = sum[0] + sum[1]; - - return sum[0]; + //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } + octo = vec_max(quad, vec_sld(quad, quad, 2)); + return pfirst(octo); } -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +template<> EIGEN_STRONG_INLINE signed char predux_max(const Packet16c& a) { - Packet4f prod; - prod = pmul(a, vec_sld(a, a, 8)); - return pfirst(pmul(prod, vec_sld(prod, prod, 4))); + Packet16c pair, quad, octo, result; + + pair = vec_max(a, vec_sld(a, a, 8)); + quad = vec_max(pair, vec_sld(pair, pair, 4)); + octo = vec_max(quad, vec_sld(quad, quad, 2)); + result = vec_max(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); } -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +template<> EIGEN_STRONG_INLINE unsigned char predux_max(const Packet16uc& a) { - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - return aux[0] * aux[1] * aux[2] * 
aux[3]; + Packet16uc pair, quad, octo, result; + + pair = vec_max(a, vec_sld(a, a, 8)); + quad = vec_max(pair, vec_sld(pair, pair, 4)); + octo = vec_max(quad, vec_sld(quad, quad, 2)); + result = vec_max(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); } -// min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { - Packet4f b, res; - b = vec_min(a, vec_sld(a, a, 8)); - res = vec_min(b, vec_sld(b, b, 4)); - return pfirst(res); + return vec_any_ne(x, pzero(x)); } -template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) -{ - Packet4i b, res; - b = vec_min(a, vec_sld(a, a, 8)); - res = vec_min(b, vec_sld(b, b, 4)); - return pfirst(res); +template EIGEN_DEVICE_FUNC inline void +ptranpose_common(PacketBlock& kernel){ + T t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); } -// max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet4f b, res; - b = vec_max(a, vec_sld(a, a, 8)); - res = vec_max(b, vec_sld(b, b, 4)); - return pfirst(res); +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranpose_common(kernel); } -template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) -{ - Packet4i b, res; - b = vec_max(a, vec_sld(a, a, 8)); - res = vec_max(b, vec_sld(b, b, 4)); - return pfirst(res); +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranpose_common(kernel); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 4) { - case 1: - first = vec_sld(first, 
second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } -#else - switch (Offset % 4) { - case 1: - first = vec_sld(second, first, 12); break; - case 2: - first = vec_sld(second, first, 8); break; - case 3: - first = vec_sld(second, first, 4); break; - } -#endif - } -}; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8s t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8us t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } -#else - switch (Offset % 4) { - case 1: - first = vec_sld(second, first, 12); break; - case 2: - first = vec_sld(second, first, 8); break; - case 3: - first = vec_sld(second, first, 4); break; - } -#endif - } -}; EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4f t0, t1, t2, t3; +ptranspose(PacketBlock& kernel) { + Packet8us t0, t1, 
t2, t3; + + t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val); + t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val); + t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val); + t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet16c t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); @@ -743,9 +1844,10 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } + EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4i t0, t1, t2, t3; +ptranspose(PacketBlock& kernel) { + Packet16uc t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); @@ -756,18 +1858,398 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8s v[8], sum[8]; + + v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); + v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); + v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); + v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); + v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); + v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); + v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); + v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); + sum[0] = vec_mergeh(v[0], v[4]); + sum[1] = vec_mergel(v[0], v[4]); + sum[2] = vec_mergeh(v[1], 
v[5]); + sum[3] = vec_mergel(v[1], v[5]); + sum[4] = vec_mergeh(v[2], v[6]); + sum[5] = vec_mergel(v[2], v[6]); + sum[6] = vec_mergeh(v[3], v[7]); + sum[7] = vec_mergel(v[3], v[7]); + + kernel.packet[0] = vec_mergeh(sum[0], sum[4]); + kernel.packet[1] = vec_mergel(sum[0], sum[4]); + kernel.packet[2] = vec_mergeh(sum[1], sum[5]); + kernel.packet[3] = vec_mergel(sum[1], sum[5]); + kernel.packet[4] = vec_mergeh(sum[2], sum[6]); + kernel.packet[5] = vec_mergel(sum[2], sum[6]); + kernel.packet[6] = vec_mergeh(sum[3], sum[7]); + kernel.packet[7] = vec_mergel(sum[3], sum[7]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8us v[8], sum[8]; + + v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); + v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); + v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); + v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); + v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); + v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); + v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); + v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); + sum[0] = vec_mergeh(v[0], v[4]); + sum[1] = vec_mergel(v[0], v[4]); + sum[2] = vec_mergeh(v[1], v[5]); + sum[3] = vec_mergel(v[1], v[5]); + sum[4] = vec_mergeh(v[2], v[6]); + sum[5] = vec_mergel(v[2], v[6]); + sum[6] = vec_mergeh(v[3], v[7]); + sum[7] = vec_mergel(v[3], v[7]); + + kernel.packet[0] = vec_mergeh(sum[0], sum[4]); + kernel.packet[1] = vec_mergel(sum[0], sum[4]); + kernel.packet[2] = vec_mergeh(sum[1], sum[5]); + kernel.packet[3] = vec_mergel(sum[1], sum[5]); + kernel.packet[4] = vec_mergeh(sum[2], sum[6]); + kernel.packet[5] = vec_mergel(sum[2], sum[6]); + kernel.packet[6] = vec_mergeh(sum[3], sum[7]); + kernel.packet[7] = vec_mergel(sum[3], sum[7]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8bf v[8], sum[8]; + + v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val); + v[1] = 
vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val); + v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val); + v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val); + v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val); + v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val); + v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val); + v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val); + sum[0] = vec_mergeh(v[0].m_val, v[4].m_val); + sum[1] = vec_mergel(v[0].m_val, v[4].m_val); + sum[2] = vec_mergeh(v[1].m_val, v[5].m_val); + sum[3] = vec_mergel(v[1].m_val, v[5].m_val); + sum[4] = vec_mergeh(v[2].m_val, v[6].m_val); + sum[5] = vec_mergel(v[2].m_val, v[6].m_val); + sum[6] = vec_mergeh(v[3].m_val, v[7].m_val); + sum[7] = vec_mergel(v[3].m_val, v[7].m_val); + + kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val); + kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val); + kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val); + kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val); + kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val); + kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val); + kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val); + kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet16c step1[16], step2[16], step3[16]; + + step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); + step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); + step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); + step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); + step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); + step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); + step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); + step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); + step1[8] = vec_mergeh(kernel.packet[4], 
kernel.packet[12]); + step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); + step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); + step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); + step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); + step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); + step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); + step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); + + step2[0] = vec_mergeh(step1[0], step1[8]); + step2[1] = vec_mergel(step1[0], step1[8]); + step2[2] = vec_mergeh(step1[1], step1[9]); + step2[3] = vec_mergel(step1[1], step1[9]); + step2[4] = vec_mergeh(step1[2], step1[10]); + step2[5] = vec_mergel(step1[2], step1[10]); + step2[6] = vec_mergeh(step1[3], step1[11]); + step2[7] = vec_mergel(step1[3], step1[11]); + step2[8] = vec_mergeh(step1[4], step1[12]); + step2[9] = vec_mergel(step1[4], step1[12]); + step2[10] = vec_mergeh(step1[5], step1[13]); + step2[11] = vec_mergel(step1[5], step1[13]); + step2[12] = vec_mergeh(step1[6], step1[14]); + step2[13] = vec_mergel(step1[6], step1[14]); + step2[14] = vec_mergeh(step1[7], step1[15]); + step2[15] = vec_mergel(step1[7], step1[15]); + + step3[0] = vec_mergeh(step2[0], step2[8]); + step3[1] = vec_mergel(step2[0], step2[8]); + step3[2] = vec_mergeh(step2[1], step2[9]); + step3[3] = vec_mergel(step2[1], step2[9]); + step3[4] = vec_mergeh(step2[2], step2[10]); + step3[5] = vec_mergel(step2[2], step2[10]); + step3[6] = vec_mergeh(step2[3], step2[11]); + step3[7] = vec_mergel(step2[3], step2[11]); + step3[8] = vec_mergeh(step2[4], step2[12]); + step3[9] = vec_mergel(step2[4], step2[12]); + step3[10] = vec_mergeh(step2[5], step2[13]); + step3[11] = vec_mergel(step2[5], step2[13]); + step3[12] = vec_mergeh(step2[6], step2[14]); + step3[13] = vec_mergel(step2[6], step2[14]); + step3[14] = vec_mergeh(step2[7], step2[15]); + step3[15] = vec_mergel(step2[7], step2[15]); + + kernel.packet[0] = vec_mergeh(step3[0], step3[8]); + 
kernel.packet[1] = vec_mergel(step3[0], step3[8]); + kernel.packet[2] = vec_mergeh(step3[1], step3[9]); + kernel.packet[3] = vec_mergel(step3[1], step3[9]); + kernel.packet[4] = vec_mergeh(step3[2], step3[10]); + kernel.packet[5] = vec_mergel(step3[2], step3[10]); + kernel.packet[6] = vec_mergeh(step3[3], step3[11]); + kernel.packet[7] = vec_mergel(step3[3], step3[11]); + kernel.packet[8] = vec_mergeh(step3[4], step3[12]); + kernel.packet[9] = vec_mergel(step3[4], step3[12]); + kernel.packet[10] = vec_mergeh(step3[5], step3[13]); + kernel.packet[11] = vec_mergel(step3[5], step3[13]); + kernel.packet[12] = vec_mergeh(step3[6], step3[14]); + kernel.packet[13] = vec_mergel(step3[6], step3[14]); + kernel.packet[14] = vec_mergeh(step3[7], step3[15]); + kernel.packet[15] = vec_mergel(step3[7], step3[15]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet16uc step1[16], step2[16], step3[16]; + + step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); + step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); + step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); + step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); + step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); + step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); + step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); + step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); + step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); + step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); + step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); + step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); + step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); + step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); + step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); + step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); + + step2[0] = vec_mergeh(step1[0], step1[8]); + step2[1] = 
vec_mergel(step1[0], step1[8]); + step2[2] = vec_mergeh(step1[1], step1[9]); + step2[3] = vec_mergel(step1[1], step1[9]); + step2[4] = vec_mergeh(step1[2], step1[10]); + step2[5] = vec_mergel(step1[2], step1[10]); + step2[6] = vec_mergeh(step1[3], step1[11]); + step2[7] = vec_mergel(step1[3], step1[11]); + step2[8] = vec_mergeh(step1[4], step1[12]); + step2[9] = vec_mergel(step1[4], step1[12]); + step2[10] = vec_mergeh(step1[5], step1[13]); + step2[11] = vec_mergel(step1[5], step1[13]); + step2[12] = vec_mergeh(step1[6], step1[14]); + step2[13] = vec_mergel(step1[6], step1[14]); + step2[14] = vec_mergeh(step1[7], step1[15]); + step2[15] = vec_mergel(step1[7], step1[15]); + + step3[0] = vec_mergeh(step2[0], step2[8]); + step3[1] = vec_mergel(step2[0], step2[8]); + step3[2] = vec_mergeh(step2[1], step2[9]); + step3[3] = vec_mergel(step2[1], step2[9]); + step3[4] = vec_mergeh(step2[2], step2[10]); + step3[5] = vec_mergel(step2[2], step2[10]); + step3[6] = vec_mergeh(step2[3], step2[11]); + step3[7] = vec_mergel(step2[3], step2[11]); + step3[8] = vec_mergeh(step2[4], step2[12]); + step3[9] = vec_mergel(step2[4], step2[12]); + step3[10] = vec_mergeh(step2[5], step2[13]); + step3[11] = vec_mergel(step2[5], step2[13]); + step3[12] = vec_mergeh(step2[6], step2[14]); + step3[13] = vec_mergel(step2[6], step2[14]); + step3[14] = vec_mergeh(step2[7], step2[15]); + step3[15] = vec_mergel(step2[7], step2[15]); + + kernel.packet[0] = vec_mergeh(step3[0], step3[8]); + kernel.packet[1] = vec_mergel(step3[0], step3[8]); + kernel.packet[2] = vec_mergeh(step3[1], step3[9]); + kernel.packet[3] = vec_mergel(step3[1], step3[9]); + kernel.packet[4] = vec_mergeh(step3[2], step3[10]); + kernel.packet[5] = vec_mergel(step3[2], step3[10]); + kernel.packet[6] = vec_mergeh(step3[3], step3[11]); + kernel.packet[7] = vec_mergel(step3[3], step3[11]); + kernel.packet[8] = vec_mergeh(step3[4], step3[12]); + kernel.packet[9] = vec_mergel(step3[4], step3[12]); + kernel.packet[10] = 
vec_mergeh(step3[5], step3[13]); + kernel.packet[11] = vec_mergel(step3[5], step3[13]); + kernel.packet[12] = vec_mergeh(step3[6], step3[14]); + kernel.packet[13] = vec_mergel(step3[6], step3[14]); + kernel.packet[14] = vec_mergeh(step3[7], step3[15]); + kernel.packet[15] = vec_mergel(step3[7], step3[15]); +} + +template EIGEN_STRONG_INLINE +Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); return vec_sel(elsePacket, thenPacket, mask); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + return pblend4(ifPacket, thenPacket, elsePacket); +} + template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); + return pblend4(ifPacket, thenPacket, elsePacket); +} + +template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) { + Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], + ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; + Packet8us mask = reinterpret_cast(vec_cmpeq(select, p8us_ONE)); + Packet8s result = vec_sel(elsePacket, thenPacket, mask); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) { + Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], + 
ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; + Packet8us mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p8us_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) { + return pblend(ifPacket, thenPacket, elsePacket); +} + +template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) { + Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], + ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], + ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], + ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; + + Packet16uc mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p16uc_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) { + Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], + ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], + ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], + ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; + + Packet16uc mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p16uc_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + 
VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vec_cts(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { + return vec_ctu(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vec_ctf(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { + return vec_ctf(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8bf& a) { + Packet4f float_even = Bf16ToF32Even(a); + Packet4f float_odd = Bf16ToF32Odd(a); + Packet4ui int_even = pcast(float_even); + Packet4ui int_odd = pcast(float_odd); + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); + Packet4ui low_even = pand(int_even, p4ui_low_mask); + Packet4ui low_odd = pand(int_odd, p4ui_low_mask); + + //Check values that are bigger than USHRT_MAX (0xFFFF) + Packet4bi overflow_selector; + if(vec_any_gt(int_even, p4ui_low_mask)){ + overflow_selector = vec_cmpgt(int_even, p4ui_low_mask); + low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector); + } + if(vec_any_gt(int_odd, p4ui_low_mask)){ + overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask); + low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector); + } + + low_odd = plogical_shift_left<16>(low_odd); + + Packet4ui int_final = por(low_even, low_odd); + return reinterpret_cast(int_final); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet8us& a) { + //short -> int -> float -> bfloat16 + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); + Packet4ui int_cast = reinterpret_cast(a); + Packet4ui int_even = pand(int_cast, p4ui_low_mask); + Packet4ui int_odd = plogical_shift_right<16>(int_cast); + Packet4f float_even = pcast(int_even); + Packet4f float_odd = pcast(int_odd); + return 
F32ToBf16(float_even, float_odd); +} + + +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return reinterpret_cast(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return reinterpret_cast(a); +} + + //---------- double ---------- #ifdef __VSX__ @@ -782,9 +2264,12 @@ typedef __vector __bool long Packet2bl; static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); +static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull }; +static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull }; static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); -static Packet2d p2d_MZERO = { -0.0, -0.0 }; +static Packet2d p2d_MZERO = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #ifdef _BIG_ENDIAN static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -792,16 +2277,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_c static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ONE), reinterpret_cast(p2d_ZERO), 8)); #endif -template Packet2d vec_splat_dbl(Packet2d& a); - -template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) -{ - return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_HI)); -} - -template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) +template Packet2d vec_splat_dbl(Packet2d& a) { - return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_LO)); + return vec_splat(a, index); } template<> struct packet_traits : default_packet_traits @@ -830,12 +2308,13 @@ template<> struct packet_traits : default_packet_traits HasRound = 1, HasFloor = 1, HasCeil = 1, + HasRint = 1, HasNegate = 1, HasBlend = 1 }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct 
unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { @@ -863,21 +2342,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif + return vec_xl(0, const_cast(from)); // cast needed by Clang } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + vec_xst(from, 0, to); } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { @@ -885,28 +2356,32 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return v; } +template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(unsigned long from) { + Packet2l v = {static_cast(from), static_cast(from)}; + return reinterpret_cast(v); +} + template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) { - a1 = pload(a); - a0 = vec_splat_dbl<0>(a1); - a1 = vec_splat_dbl<1>(a1); - a3 = pload(a+2); - a2 = vec_splat_dbl<0>(a3); - a3 = vec_splat_dbl<1>(a3); + //This way is faster than vec_splat (at least for doubles in Power 9) + a0 = pset1(a[0]); + a1 = pset1(a[1]); + a2 = pset1(a[2]); + a3 = pset1(a[3]); } template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; 
to[1*stride] = af[1]; @@ -930,6 +2405,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN Packet2d ret; __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); return ret; @@ -937,11 +2413,20 @@ template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN Packet2d ret; __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); return ret; } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { + Packet2d c = reinterpret_cast(vec_cmpge(a,b)); + return vec_nor(c,c); +} + template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } @@ -950,14 +2435,34 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) +{ + Packet2d t = 
vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p2ul_SIGN), p2ul_PREV0DOT5)), a); + Packet2d res; + + __asm__("xvrdpiz %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (t)); + + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) +{ + Packet2d res; + + __asm__("xvrdpic %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (a)); + + return res; +} template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD - return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); + EIGEN_DEBUG_UNALIGNED_LOAD + return vec_xl(0, const_cast(from)); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) @@ -970,13 +2475,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + EIGEN_DEBUG_UNALIGNED_STORE + vec_xst(from, 0, to); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { @@ -984,6 +2489,177 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +// VSX support varies between different compilers and even different +// versions of the same compiler. For gcc version >= 4.9.3, we can use +// vec_cts to efficiently convert Packet2d to Packet2l. 
Otherwise, use +// a slow version that works with older compilers. +// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles +// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 +template<> +inline Packet2l pcast(const Packet2d& x) { +#if EIGEN_GNUC_AT_LEAST(5, 4) || \ + (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) + return vec_cts(x, 0); // TODO: check clang version. +#else + double tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2l l = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return l; +#endif +} + +template<> +inline Packet2d pcast(const Packet2l& x) { + unsigned long long tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2d d = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return d; +} + + +// Packet2l shifts. +// For POWER8 we simply use vec_sr/l. +// +// Things are more complicated for POWER7. There is actually a +// vec_xxsxdi intrinsic but it is not supported by some gcc versions. +// So we need to shift by N % 32 and rearrage bytes. +#ifdef __POWER8_VECTOR__ + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { + const Packet2ul shift = { N, N }; + return vec_sl(a, shift); +} + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { + const Packet2ul shift = { N, N }; + return vec_sr(a, shift); +} + +#else + +// Shifts [A, B, C, D] to [B, 0, D, 0]. +// Used to implement left shifts for Packet2l. +EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) { + static const Packet16uc perm = { + 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; + #ifdef _BIG_ENDIAN + return vec_perm(p4i_ZERO, a, perm); + #else + return vec_perm(a, p4i_ZERO, perm); + #endif +} + +// Shifts [A, B, C, D] to [0, A, 0, C]. +// Used to implement right shifts for Packet2l. 
+EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) { + static const Packet16uc perm = { + 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, + 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b }; + #ifdef _BIG_ENDIAN + return vec_perm(p4i_ZERO, a, perm); + #else + return vec_perm(a, p4i_ZERO, perm); + #endif +} + +template +struct plogical_shift_left_impl; + +template +struct plogical_shift_left_impl= 0)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned n = static_cast(N); + const Packet4ui shift = {n, n, n, n}; + const Packet4i ai = reinterpret_cast(a); + static const unsigned m = static_cast(32 - N); + const Packet4ui shift_right = {m, m, m, m}; + const Packet4i out_hi = vec_sl(ai, shift); + const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right)); + return reinterpret_cast(por(out_hi, out_lo)); + } +}; + +template +struct plogical_shift_left_impl= 32)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned m = static_cast(N - 32); + const Packet4ui shift = {m, m, m, m}; + const Packet4i ai = reinterpret_cast(a); + return reinterpret_cast(shift_even_left(vec_sl(ai, shift))); + } +}; + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { + return plogical_shift_left_impl::run(a); +} + +template +struct plogical_shift_right_impl; + +template +struct plogical_shift_right_impl= 0)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned n = static_cast(N); + const Packet4ui shift = {n, n, n, n}; + const Packet4i ai = reinterpret_cast(a); + static const unsigned m = static_cast(32 - N); + const Packet4ui shift_left = {m, m, m, m}; + const Packet4i out_lo = vec_sr(ai, shift); + const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left)); + return reinterpret_cast(por(out_hi, out_lo)); + } +}; + +template +struct plogical_shift_right_impl= 32)>::type> { + static EIGEN_STRONG_INLINE Packet2l 
run(const Packet2l& a) { + static const unsigned m = static_cast(N - 32); + const Packet4ui shift = {m, m, m, m}; + const Packet4i ai = reinterpret_cast(a); + return reinterpret_cast(shift_odd_right(vec_sr(ai, shift))); + } +}; + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { + return plogical_shift_right_impl::run(a); +} +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet2d max_exponent = pset1(2099.0); + const Packet2l e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + + // Split 2^e into four factors and multiply: + const Packet2l bias = { 1023, 1023 }; + Packet2l b = plogical_shift_right<2>(e); // floor(e/4) + Packet2d c = reinterpret_cast(plogical_shift_left<52>(b + bias)); + Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + c = reinterpret_cast(plogical_shift_left<52>(b + bias)); // 2^(e - 3b) + out = pmul(out, c); // a * 2^e + return out; +} + + +// Extract exponent without existence of Packet2l. 
+template<> +EIGEN_STRONG_INLINE +Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) { + return pcast(plogical_shift_right<52>(reinterpret_cast(pabs(a)))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pfrexp (const Packet2d& a, Packet2d& exponent) { + return pfrexp_generic(a, exponent); +} + template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { Packet2d b, sum; @@ -992,20 +2668,6 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) return pfirst(sum); } -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - Packet2d v[2], sum; - v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); - v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); - -#ifdef _BIG_ENDIAN - sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); -#else - sum = reinterpret_cast(vec_sld(reinterpret_cast(v[1]), reinterpret_cast(v[0]), 8)); -#endif - - return sum; -} // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) @@ -1025,20 +2687,6 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset == 1) -#ifdef _BIG_ENDIAN - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); -#else - first = reinterpret_cast(vec_sld(reinterpret_cast(second), reinterpret_cast(first), 8)); -#endif - } -}; - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet2d t0, t1; @@ -1053,6 +2701,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons Packet2bl mask = reinterpret_cast( vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)) ); return vec_sel(elsePacket, 
thenPacket, mask); } + + #endif // __VSX__ } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/CUDA/Complex.h b/inst/include/Eigen/src/Core/arch/CUDA/Complex.h index 9c253650..deb4c869 100644 --- a/inst/include/Eigen/src/Core/arch/CUDA/Complex.h +++ b/inst/include/Eigen/src/Core/arch/CUDA/Complex.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2021 C. Antonio Sanchez // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -11,93 +12,247 @@ #define EIGEN_COMPLEX_CUDA_H // clang-format off +// Many std::complex methods such as operator+, operator-, operator* and +// operator/ are not constexpr. Due to this, GCC and older versions of clang do +// not treat them as device functions and thus Eigen functors making use of +// these operators fail to compile. Here, we manually specialize these +// operators and functors for complex types when building for CUDA to enable +// their use on-device. + +#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE) + +// ICC already specializes std::complex and std::complex +// operators, preventing us from making them device functions here. +// This will lead to silent runtime errors if the operators are used on device. +// +// To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_ +// prior to first inclusion of . This prevents ICC from adding +// its own specializations, so our custom ones below can be used instead. +#if !(defined(EIGEN_COMP_ICC) && defined(_USE_COMPLEX_SPECIALIZATION_)) + +// Import Eigen's internal operator specializations. 
+#define EIGEN_USING_STD_COMPLEX_OPERATORS \ + using Eigen::complex_operator_detail::operator+; \ + using Eigen::complex_operator_detail::operator-; \ + using Eigen::complex_operator_detail::operator*; \ + using Eigen::complex_operator_detail::operator/; \ + using Eigen::complex_operator_detail::operator+=; \ + using Eigen::complex_operator_detail::operator-=; \ + using Eigen::complex_operator_detail::operator*=; \ + using Eigen::complex_operator_detail::operator/=; \ + using Eigen::complex_operator_detail::operator==; \ + using Eigen::complex_operator_detail::operator!=; namespace Eigen { -namespace internal { +// Specialized std::complex overloads. +namespace complex_operator_detail { -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex complex_multiply(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); + const T b_real = numext::real(b); + const T b_imag = numext::imag(b); + return std::complex( + a_real * b_real - a_imag * b_imag, + a_imag * b_real + a_real * b_imag); +} -// Many std::complex methods such as operator+, operator-, operator* and -// operator/ are not constexpr. Due to this, clang does not treat them as device -// functions and thus Eigen functors making use of these operators fail to -// compile. Here, we manually specialize these functors for complex types when -// building for CUDA to avoid non-constexpr methods. 
- -// Sum -template struct scalar_sum_op, const std::complex > : binary_op_base, const std::complex > { - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - return std::complex(numext::real(a) + numext::real(b), - numext::imag(a) + numext::imag(b)); - } -}; - -template struct scalar_sum_op, std::complex > : scalar_sum_op, const std::complex > {}; - - -// Difference -template struct scalar_difference_op, const std::complex > : binary_op_base, const std::complex > { - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - return std::complex(numext::real(a) - numext::real(b), - numext::imag(a) - numext::imag(b)); - } -}; - -template struct scalar_difference_op, std::complex > : scalar_difference_op, const std::complex > {}; - - -// Product -template struct scalar_product_op, const std::complex > : binary_op_base, const std::complex > { - enum { - Vectorizable = packet_traits>::HasMul - }; - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - const T a_real = numext::real(a); - const T a_imag = numext::imag(a); - const T b_real = numext::real(b); - const T b_imag = numext::imag(b); - return std::complex(a_real * b_real - a_imag * b_imag, - a_real * b_imag + a_imag * b_real); - } -}; - -template struct scalar_product_op, std::complex > : scalar_product_op, const std::complex > {}; - - -// Quotient -template struct scalar_quotient_op, const std::complex > : binary_op_base, const std::complex > { - enum { - Vectorizable = packet_traits>::HasDiv - }; - typedef typename std::complex result_type; - - 
EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - const T a_real = numext::real(a); - const T a_imag = numext::imag(a); - const T b_real = numext::real(b); - const T b_imag = numext::imag(b); - const T norm = T(1) / (b_real * b_real + b_imag * b_imag); - return std::complex((a_real * b_real + a_imag * b_imag) * norm, - (a_imag * b_real - a_real * b_imag) * norm); - } -}; - -template struct scalar_quotient_op, std::complex > : scalar_quotient_op, const std::complex > {}; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex complex_divide_fast(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); + const T b_real = numext::real(b); + const T b_imag = numext::imag(b); + const T norm = (b_real * b_real + b_imag * b_imag); + return std::complex((a_real * b_real + a_imag * b_imag) / norm, + (a_imag * b_real - a_real * b_imag) / norm); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex complex_divide_stable(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); + const T b_real = numext::real(b); + const T b_imag = numext::imag(b); + // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf), + // guards against over/under-flow. + const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real); + const T rscale = scale_imag ? T(1) : b_real / b_imag; + const T iscale = scale_imag ? 
b_imag / b_real : T(1); + const T denominator = b_real * rscale + b_imag * iscale; + return std::complex((a_real * rscale + a_imag * iscale) / denominator, + (a_imag * rscale - a_real * iscale) / denominator); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex complex_divide(const std::complex& a, const std::complex& b) { +#if EIGEN_FAST_MATH + return complex_divide_fast(a, b); +#else + return complex_divide_stable(a, b); #endif +} + +// NOTE: We cannot specialize compound assignment operators with Scalar T, +// (i.e. operator@=(const T&), for @=+,-,*,/) +// since they are already specialized for float/double/long double within +// the standard header. We also do not specialize the stream +// operators. +#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T) \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator+(const std::complex& a) { return a; } \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator-(const std::complex& a) { \ + return std::complex(-numext::real(a), -numext::imag(a)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator+(const std::complex& a, const std::complex& b) { \ + return std::complex(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator+(const std::complex& a, const T& b) { \ + return std::complex(numext::real(a) + b, numext::imag(a)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator+(const T& a, const std::complex& b) { \ + return std::complex(a + numext::real(b), numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator-(const std::complex& a, const std::complex& b) { \ + return std::complex(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator-(const std::complex& a, const T& b) { \ + return std::complex(numext::real(a) 
- b, numext::imag(a)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator-(const T& a, const std::complex& b) { \ + return std::complex(a - numext::real(b), -numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator*(const std::complex& a, const std::complex& b) { \ + return complex_multiply(a, b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator*(const std::complex& a, const T& b) { \ + return std::complex(numext::real(a) * b, numext::imag(a) * b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator*(const T& a, const std::complex& b) { \ + return std::complex(a * numext::real(b), a * numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator/(const std::complex& a, const std::complex& b) { \ + return complex_divide(a, b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator/(const std::complex& a, const T& b) { \ + return std::complex(numext::real(a) / b, numext::imag(a) / b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator/(const T& a, const std::complex& b) { \ + return complex_divide(std::complex(a, 0), b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex& operator+=(std::complex& a, const std::complex& b) { \ + numext::real_ref(a) += numext::real(b); \ + numext::imag_ref(a) += numext::imag(b); \ + return a; \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex& operator-=(std::complex& a, const std::complex& b) { \ + numext::real_ref(a) -= numext::real(b); \ + numext::imag_ref(a) -= numext::imag(b); \ + return a; \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex& operator*=(std::complex& a, const std::complex& b) { \ + a = complex_multiply(a, b); \ + return a; \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex& operator/=(std::complex& a, const std::complex& b) { \ + a = complex_divide(a, b); \ + return a; \ +} \ 
+ \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator==(const std::complex& a, const std::complex& b) { \ + return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator==(const std::complex& a, const T& b) { \ + return numext::real(a) == b && numext::imag(a) == 0; \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator==(const T& a, const std::complex& b) { \ + return a == numext::real(b) && 0 == numext::imag(b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator!=(const std::complex& a, const std::complex& b) { \ + return !(a == b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator!=(const std::complex& a, const T& b) { \ + return !(a == b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator!=(const T& a, const std::complex& b) { \ + return !(a == b); \ +} + +// Do not specialize for long double, since that reduces to double on device. 
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float) +EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double) + +#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS + + +} // namespace complex_operator_detail + +EIGEN_USING_STD_COMPLEX_OPERATORS + +namespace numext { +EIGEN_USING_STD_COMPLEX_OPERATORS +} // namespace numext + +namespace internal { +EIGEN_USING_STD_COMPLEX_OPERATORS + +} // namespace internal +} // namespace Eigen -} // end namespace internal +#endif // !(EIGEN_COMP_ICC && _USE_COMPLEX_SPECIALIZATION_) -} // end namespace Eigen +#endif // EIGEN_CUDACC && EIGEN_GPU_COMPILE_PHASE -#endif // EIGEN_COMPLEX_CUDA_H +#endif // EIGEN_COMPLEX_CUDA_H diff --git a/inst/include/Eigen/src/Core/arch/CUDA/Half.h b/inst/include/Eigen/src/Core/arch/CUDA/Half.h deleted file mode 100644 index c65724d0..00000000 --- a/inst/include/Eigen/src/Core/arch/CUDA/Half.h +++ /dev/null @@ -1,677 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// -// The conversion routines are Copyright (c) Fabian Giesen, 2016. -// The original license follows: -// -// Copyright (c) Fabian Giesen, 2016 -// All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted. -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting from CUDA's __half struct) with -// operator overloads such that it behaves basically as an arithmetic -// type. It will be quite slow on CPUs (so it is recommended to stay -// in float32_bits for CPUs, except for simple parameter conversions, I/O -// to disk and the likes), but fast on GPUs. - - -#ifndef EIGEN_HALF_CUDA_H -#define EIGEN_HALF_CUDA_H - -#if __cplusplus > 199711L -#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() -#else -#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() -#endif - -#include - -namespace Eigen { - -struct half; - -namespace half_impl { - -#if !defined(EIGEN_HAS_CUDA_FP16) -// Make our own __half_raw definition that is similar to CUDA's. 
-struct __half_raw { - EIGEN_DEVICE_FUNC __half_raw() : x(0) {} - explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} - unsigned short x; -}; -#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 -// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw -typedef __half __half_raw; -#endif - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); - -struct half_base : public __half_raw { - EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} - EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} -#endif -}; - -} // namespace half_impl - -// Class definition. -struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) - typedef half_impl::__half_raw __half_raw; - #endif - - EIGEN_DEVICE_FUNC half() {} - - EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} - EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} -#endif - - explicit EIGEN_DEVICE_FUNC half(bool b) - : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 
0x3c00 : 0)) {} - template - explicit EIGEN_DEVICE_FUNC half(const T& val) - : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(val))) {} - explicit EIGEN_DEVICE_FUNC half(float f) - : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} - - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { - // +0.0 and -0.0 become false, everything else becomes true. - return (x & 0x7fff) != 0; - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { - return static_cast(half_impl::half_to_float(*this)); - } -#if EIGEN_HAS_CXX11 - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { - return static_cast(half_to_float(*this)); - } -#endif - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { - return half_impl::half_to_float(*this); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { - return static_cast(half_impl::half_to_float(*this)); - } - - EIGEN_DEVICE_FUNC half& operator=(const half& other) { - x = other.x; - return *this; - } -}; - -} // end namespace Eigen - -namespace std { -template<> -struct 
numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } - static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } - static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } - static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } - static Eigen::half round_error() { return Eigen::half(0.5); } - static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } - static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } -}; - -// If std::numeric_limits is 
specialized, should also specialize -// std::numeric_limits, std::numeric_limits, and -// std::numeric_limits -// https://stackoverflow.com/a/16519653/ -template<> -struct numeric_limits : numeric_limits {}; -template<> -struct numeric_limits : numeric_limits {}; -template<> -struct numeric_limits : numeric_limits {}; -} // end namespace std - -namespace Eigen { - -namespace half_impl { - -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - -// Intrinsics for native fp16 support. Note that on current hardware, -// these are no faster than float32_bits arithmetic (you need to use the half2 -// versions to get the ALU speed increased), but you do save the -// conversion steps back and forth. - -EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { - return __hadd(a, b); -} -EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { - return __hmul(a, b); -} -EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { - return __hsub(a, b); -} -EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -} -EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { - return __hneg(a); -} -EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { - a = a + b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { - a = a * b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { - a = a - b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { - a = a / b; - return a; -} -EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { - return __heq(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { - return __hne(a, b); -} -EIGEN_STRONG_INLINE __device__ 
bool operator < (const half& a, const half& b) { - return __hlt(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { - return __hle(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { - return __hgt(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { - return __hge(a, b); -} - -#else // Emulate support for half floats - -// Definitions for CPUs and older CUDA, mostly working through conversion -// to/from float32_bits. - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { - return half(float(a) + float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { - return half(float(a) * float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { - return half(float(a) - float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { - return half(float(a) / float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { - half result; - result.x = a.x ^ 0x8000; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { - a = half(float(a) + float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { - a = half(float(a) * float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { - a = half(float(a) - float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { - a = half(float(a) / float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { - return numext::equal_strict(float(a),float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { - return numext::not_equal_strict(float(a), float(b)); 
-} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { - return float(a) < float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { - return float(a) <= float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { - return float(a) > float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { - return float(a) >= float(b); -} - -#endif // Emulate support for half floats - -// Division by an index. Do it in full float precision to avoid accuracy -// issues in converting the denominator to half. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { - return half(static_cast(a) / static_cast(b)); -} - -// Conversion routines, including fallbacks for the host or older CUDA. -// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of -// these in hardware. If we need more performance on older/other CPUs, they are -// also possible to vectorize directly. 
- -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { - __half_raw h; - h.x = x; - return h; -} - -union float32_bits { - unsigned int u; - float f; -}; - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 - __half tmp_ff = __float2half(ff); - return *(__half_raw*)&tmp_ff; - -#elif defined(EIGEN_HAS_FP16_C) - __half_raw h; - h.x = _cvtss_sh(ff, 0); - return h; - -#else - float32_bits f; f.f = ff; - - const float32_bits f32infty = { 255 << 23 }; - const float32_bits f16max = { (127 + 16) << 23 }; - const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; - unsigned int sign_mask = 0x80000000u; - __half_raw o; - o.x = static_cast(0x0u); - - unsigned int sign = f.u & sign_mask; - f.u ^= sign; - - // NOTE all the integer compares in this function can be safely - // compiled into signed compares since all operands are below - // 0x80000000. Important if you want fast straight SSE2 code - // (since there's no unsigned PCMPGTD). - - if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) - o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf - } else { // (De)normalized number or zero - if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero - // use a magic value to align our 10 mantissa bits at the bottom of - // the float. as long as FP addition is round-to-nearest-even this - // just works. - f.f += denorm_magic.f; - - // and one integer subtract of the bias later, we have our final float! - o.x = static_cast(f.u - denorm_magic.u); - } else { - unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd - - // update exponent, rounding bias part 1 - f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; - // rounding bias part 2 - f.u += mant_odd; - // take the bits! 
- o.x = static_cast(f.u >> 13); - } - } - - o.x |= static_cast(sign >> 16); - return o; -#endif -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 - return __half2float(h); - -#elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); - -#else - const float32_bits magic = { 113 << 23 }; - const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift - float32_bits o; - - o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits - unsigned int exp = shifted_exp & o.u; // just the exponent - o.u += (127 - 15) << 23; // exponent adjust - - // handle exponent special cases - if (exp == shifted_exp) { // Inf/NaN? - o.u += (128 - 16) << 23; // extra exp adjust - } else if (exp == 0) { // Zero/Denormal? - o.u += 1 << 23; // extra exp adjust - o.f -= magic.f; // renormalize - } - - o.u |= (h.x & 0x8000) << 16; // sign bit - return o.f; -#endif -} - -// --- standard functions --- - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { - return (a.x & 0x7fff) == 0x7c00; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return __hisnan(a); -#else - return (a.x & 0x7fff) > 0x7c00; -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { - return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { - half result; - result.x = a.x & 0x7FFF; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 - return half(hexp(a)); -#else - return half(::expf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && 
defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return half(::hlog(a)); -#else - return half(::logf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { - return half(numext::log1p(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { - return half(::log10f(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 - return half(hsqrt(a)); -#else - return half(::sqrtf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { - return half(::powf(float(a), float(b))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { - return half(::sinf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { - return half(::cosf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { - return half(::tanf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { - return half(::tanhf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 - return half(hfloor(a)); -#else - return half(::floorf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 - return half(hceil(a)); -#else - return half(::ceilf(float(a))); -#endif -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return __hlt(b, a) ? b : a; -#else - const float f1 = static_cast(a); - const float f2 = static_cast(b); - return f2 < f1 ? 
b : a; -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return __hlt(a, b) ? b : a; -#else - const float f1 = static_cast(a); - const float f2 = static_cast(b); - return f1 < f2 ? b : a; -#endif -} - -EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { - os << static_cast(v); - return os; -} - -} // end namespace half_impl - -// import Eigen::half_impl::half into Eigen namespace -// using half_impl::half; - -namespace internal { - -template<> -struct random_default_impl -{ - static inline half run(const half& x, const half& y) - { - return x + (y-x) * half(float(std::rand()) / float(RAND_MAX)); - } - static inline half run() - { - return run(half(-1.f), half(1.f)); - } -}; - -template<> struct is_arithmetic { enum { value = true }; }; - -} // end namespace internal - -template<> struct NumTraits - : GenericNumTraits -{ - enum { - IsSigned = true, - IsInteger = false, - IsComplex = false, - RequireInitialization = false - }; - - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { - return half_impl::raw_uint16_to_half(0x0800); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { - return half_impl::raw_uint16_to_half(0x7bff); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { - return half_impl::raw_uint16_to_half(0xfbff); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() { - return half_impl::raw_uint16_to_half(0x7c00); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { - return half_impl::raw_uint16_to_half(0x7c01); - } -}; - -} // end namespace Eigen - -// C-like standard mathematical functions and trancendentals. 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { - Eigen::half result; - result.x = a.x & 0x7FFF; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { - return Eigen::half(::expf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return Eigen::half(::hlog(a)); -#else - return Eigen::half(::logf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { - return Eigen::half(::sqrtf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { - return Eigen::half(::powf(float(a), float(b))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { - return Eigen::half(::floorf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { - return Eigen::half(::ceilf(float(a))); -} - -namespace std { - -#if __cplusplus > 199711L -template <> -struct hash { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { - return static_cast(a.x); - } -}; -#endif - -} // end namespace std - - -// Add the missing shfl_xor intrinsic -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 -__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - #if EIGEN_CUDACC_VER < 90000 - return static_cast(__shfl_xor(static_cast(var), laneMask, width)); - #else - return static_cast(__shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); - #endif -} -#endif - -// ldg() has an overload for __half_raw, but we also need one for Eigen::half. 
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { - return Eigen::half_impl::raw_uint16_to_half( - __ldg(reinterpret_cast(ptr))); -} -#endif - - -#if defined(EIGEN_CUDA_ARCH) -namespace Eigen { -namespace numext { - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isnan)(const Eigen::half& h) { - return (half_impl::isnan)(h); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isinf)(const Eigen::half& h) { - return (half_impl::isinf)(h); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isfinite)(const Eigen::half& h) { - return (half_impl::isfinite)(h); -} - -} // namespace Eigen -} // namespace numext -#endif - -#endif // EIGEN_HALF_CUDA_H diff --git a/inst/include/Eigen/src/Core/arch/CUDA/PacketMath.h b/inst/include/Eigen/src/Core/arch/CUDA/PacketMath.h deleted file mode 100644 index 4dda6318..00000000 --- a/inst/include/Eigen/src/Core/arch/CUDA/PacketMath.h +++ /dev/null @@ -1,333 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_CUDA_H -#define EIGEN_PACKET_MATH_CUDA_H - -namespace Eigen { - -namespace internal { - -// Make sure this is only available when targeting a GPU: we don't want to -// introduce conflicts between these packet_traits definitions and the ones -// we'll use on the host side (SSE, AVX, ...) 
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef float4 type; - typedef float4 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket = 0, - - HasDiv = 1, - HasSin = 0, - HasCos = 0, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasIGamma = 1, - HasIGammac = 1, - HasBetaInc = 1, - - HasBlend = 0, - }; -}; - -template<> struct packet_traits : default_packet_traits -{ - typedef double2 type; - typedef double2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - - HasDiv = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasIGamma = 1, - HasIGammac = 1, - HasBetaInc = 1, - - HasBlend = 0, - }; -}; - - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { - return make_float4(from, from, from, from); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { - return make_double2(from, from); -} - - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { - return make_float4(a, a+1, a+2, a+3); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { - return make_double2(a, a+1); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { - return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -} 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { - return make_double2(a.x+b.x, a.y+b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { - return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { - return make_double2(a.x-b.x, a.y-b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { - return make_float4(-a.x, -a.y, -a.z, -a.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { - return make_double2(-a.x, -a.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { - return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { - return make_double2(a.x*b.x, a.y*b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { - return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { - return make_double2(a.x/b.x, a.y/b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { - return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { - return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { - return make_float4(fmaxf(a.x, 
b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { - return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { - return make_float4(from[0], from[1], from[2], from[3]); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { - return make_double2(from[0], from[1]); -} - -template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { - return make_float4(from[0], from[0], from[1], from[1]); -} -template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { - return make_double2(from[0], from[0]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { - to[0] = from.x; - to[1] = from.y; - to[2] = from.z; - to[3] = from.w; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { - to[0] = from.x; - to[1] = from.y; -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const float4*)from); -#else - return make_float4(from[0], from[1], from[2], from[3]); -#endif -} -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const 
double2*)from); -#else - return make_double2(from[0], from[1]); -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); -#else - return make_float4(from[0], from[1], from[2], from[3]); -#endif -} -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return make_double2(__ldg(from+0), __ldg(from+1)); -#else - return make_double2(from[0], from[1]); -#endif -} - -template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { - return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { - return make_double2(from[0*stride], from[1*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { - to[stride*0] = from.x; - to[stride*1] = from.y; - to[stride*2] = from.z; - to[stride*3] = from.w; -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { - to[stride*0] = from.x; - to[stride*1] = from.y; -} - -template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { - return a.x; -} -template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { - return a.x; -} - -template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { - return a.x + a.y + a.z + a.w; -} -template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { - return a.x + a.y; -} - -template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { - return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { - return fmax(a.x, a.y); -} - -template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { - 
return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { - return fmin(a.x, a.y); -} - -template<> EIGEN_DEVICE_FUNC inline float predux_mul(const float4& a) { - return a.x * a.y * a.z * a.w; -} -template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) { - return a.x * a.y; -} - -template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { - return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { - return make_double2(fabs(a.x), fabs(a.y)); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float tmp = kernel.packet[0].y; - kernel.packet[0].y = kernel.packet[1].x; - kernel.packet[1].x = tmp; - - tmp = kernel.packet[0].z; - kernel.packet[0].z = kernel.packet[2].x; - kernel.packet[2].x = tmp; - - tmp = kernel.packet[0].w; - kernel.packet[0].w = kernel.packet[3].x; - kernel.packet[3].x = tmp; - - tmp = kernel.packet[1].z; - kernel.packet[1].z = kernel.packet[2].y; - kernel.packet[2].y = tmp; - - tmp = kernel.packet[1].w; - kernel.packet[1].w = kernel.packet[3].y; - kernel.packet[3].y = tmp; - - tmp = kernel.packet[2].w; - kernel.packet[2].w = kernel.packet[3].z; - kernel.packet[3].z = tmp; -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; - kernel.packet[0].y = kernel.packet[1].x; - kernel.packet[1].x = tmp; -} - -#endif - -} // end namespace internal - -} // end namespace Eigen - - -#endif // EIGEN_PACKET_MATH_CUDA_H diff --git a/inst/include/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/inst/include/Eigen/src/Core/arch/CUDA/PacketMathHalf.h deleted file mode 100644 index f749c573..00000000 --- a/inst/include/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ /dev/null @@ -1,1124 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H -#define EIGEN_PACKET_MATH_HALF_CUDA_H - - -namespace Eigen { -namespace internal { - -// Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef half2 type; - typedef half2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - HasAdd = 1, - HasMul = 1, - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasLog = 1, - HasLog1p = 1 - }; -}; - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; - -template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { - return __half2half2(from); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { - return *reinterpret_cast(from); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return __halves2half2(from[0], from[1]); -} - -template<> EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return __halves2half2(from[0], from[0]); -} - -template<> __device__ EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { - *reinterpret_cast(to) = from; -} - -template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = __low2half(from); - to[1] = __high2half(from); -} - -template<> - __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 - return __ldg((const half2*)from); -#else - return 
__halves2half2(*(from+0), *(from+1)); -#endif -} - -template<> -__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 - return __halves2half2(__ldg(from+0), __ldg(from+1)); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); -} - -template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return __low2half(a); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half2 result; - unsigned temp = *(reinterpret_cast(&(a))); - *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; - return result; -} - - -__device__ EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __half a1 = __low2half(kernel.packet[0]); - __half a2 = __high2half(kernel.packet[0]); - __half b1 = __low2half(kernel.packet[1]); - __half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if __CUDA_ARCH__ >= 530 - return __halves2half2(a, __hadd(a, __float2half(1.0f))); -#else - float f = __half2float(a) + 1.0f; - return __halves2half2(a, __float2half(f)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 - return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& 
b) { -#if __CUDA_ARCH__ >= 530 - return __hsub2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 - b1; - float r2 = a2 - b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hneg2(a); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return __floats2half2_rn(-a1, -a2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } - -template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 - return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if __CUDA_ARCH__ >= 530 - return __hfma2(a, b, c); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float c1 = __low2float(c); - float c2 = __high2float(c); - float r1 = a1 * b1 + c1; - float r2 = a2 * b2 + c2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? 
__high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hadd(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half_rn(a1 + a2)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if __CUDA_ARCH__ >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 > a2 ? __low2half(a) : __high2half(a); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if __CUDA_ARCH__ >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 < a2 ? 
__low2half(a) : __high2half(a); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hmul(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half_rn(a1 * a2)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = log1pf(a1); - float r2 = log1pf(a2); - return __floats2half2_rn(r1, r2); -} - -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 - -template<> __device__ EIGEN_STRONG_INLINE -half2 plog(const half2& a) { - return h2log(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 pexp(const half2& a) { - return h2exp(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 psqrt(const half2& a) { - return h2sqrt(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 prsqrt(const half2& a) { - return h2rsqrt(a); -} - -#else - -template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = logf(a1); - float r2 = logf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expf(a1); - float r2 = expf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = sqrtf(a1); - float r2 = sqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = rsqrtf(a1); - float r2 = rsqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -#endif - -#elif defined EIGEN_VECTORIZE_AVX512 - -typedef struct { - 
__m256i x; -} Packet16h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet16h type; - // There is no half-size packet for Packet16h. - typedef Packet16h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; - -template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { - Packet16h result; - result.x = _mm256_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_load_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_loadu_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { - _mm256_store_si256((__m256i*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { - _mm256_storeu_si256((__m256i*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploadquad(const Eigen::half* from) { - Packet16h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); - return result; -} - 
-EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm512_cvtph_ps(a.x); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet16h result; - result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - Packet16h result; - result.x = _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux(from_float)); -} - -template<> EIGEN_STRONG_INLINE Packet16h pgather(const 
Eigen::half* from, Index stride) -{ - Packet16h result; - result.x = _mm256_set_epi16( - from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, - from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, - from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, - from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) -{ - EIGEN_ALIGN64 half aux[16]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; - to[stride*8].x = aux[8].x; - to[stride*9].x = aux[9].x; - to[stride*10].x = aux[10].x; - to[stride*11].x = aux[11].x; - to[stride*12].x = aux[12].x; - to[stride*13].x = aux[13].x; - to[stride*14].x = aux[14].x; - to[stride*15].x = aux[15].x; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m256i a = kernel.packet[0].x; - __m256i b = kernel.packet[1].x; - __m256i c = kernel.packet[2].x; - __m256i d = kernel.packet[3].x; - __m256i e = kernel.packet[4].x; - __m256i f = kernel.packet[5].x; - __m256i g = kernel.packet[6].x; - __m256i h = kernel.packet[7].x; - __m256i i = kernel.packet[8].x; - __m256i j = kernel.packet[9].x; - __m256i k = kernel.packet[10].x; - __m256i l = kernel.packet[11].x; - __m256i m = kernel.packet[12].x; - __m256i n = kernel.packet[13].x; - __m256i o = kernel.packet[14].x; - __m256i p = kernel.packet[15].x; - - __m256i ab_07 = _mm256_unpacklo_epi16(a, b); - __m256i cd_07 = _mm256_unpacklo_epi16(c, d); - __m256i ef_07 = _mm256_unpacklo_epi16(e, f); - __m256i gh_07 = _mm256_unpacklo_epi16(g, h); - __m256i ij_07 = _mm256_unpacklo_epi16(i, j); - __m256i kl_07 = _mm256_unpacklo_epi16(k, l); - __m256i mn_07 = _mm256_unpacklo_epi16(m, n); - __m256i op_07 = 
_mm256_unpacklo_epi16(o, p); - - __m256i ab_8f = _mm256_unpackhi_epi16(a, b); - __m256i cd_8f = _mm256_unpackhi_epi16(c, d); - __m256i ef_8f = _mm256_unpackhi_epi16(e, f); - __m256i gh_8f = _mm256_unpackhi_epi16(g, h); - __m256i ij_8f = _mm256_unpackhi_epi16(i, j); - __m256i kl_8f = _mm256_unpackhi_epi16(k, l); - __m256i mn_8f = _mm256_unpackhi_epi16(m, n); - __m256i op_8f = _mm256_unpackhi_epi16(o, p); - - __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); - __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); - __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); - __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); - __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); - __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); - __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); - __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); - - __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); - __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); - __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); - __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); - __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); - __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); - __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); - __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); - - __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); - __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); - __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); - __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); - __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); - __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); - __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); - __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); - __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); - __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); - __m256i ijklmnop_89 = 
_mm256_unpacklo_epi64(ijkl_8b, mnop_8b); - __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); - __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); - __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); - __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); - __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); - - // NOTE: no unpacklo/hi instr in this case, so using permute instr. - __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); - __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); - - kernel.packet[0].x = a_p_0; - kernel.packet[1].x = a_p_1; - kernel.packet[2].x = a_p_2; - kernel.packet[3].x = a_p_3; - kernel.packet[4].x = a_p_4; - kernel.packet[5].x = a_p_5; - kernel.packet[6].x = a_p_6; - kernel.packet[7].x = a_p_7; - kernel.packet[8].x = a_p_8; - kernel.packet[9].x = a_p_9; - kernel.packet[10].x = 
a_p_a; - kernel.packet[11].x = a_p_b; - kernel.packet[12].x = a_p_c; - kernel.packet[13].x = a_p_d; - kernel.packet[14].x = a_p_e; - kernel.packet[15].x = a_p_f; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[8][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - pstore(in[4], kernel.packet[4]); - pstore(in[5], kernel.packet[5]); - pstore(in[6], kernel.packet[6]); - pstore(in[7], kernel.packet[7]); - - EIGEN_ALIGN64 half out[8][16]; - - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 8; ++j) { - out[i][j+8] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); - kernel.packet[4] = pload(out[4]); - kernel.packet[5] = pload(out[5]); - kernel.packet[6] = pload(out[6]); - kernel.packet[7] = pload(out[7]); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[4][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN64 half out[4][16]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][4*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][4*i+1]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+8] = in[j][4*i+2]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+12] = in[j][4*i+3]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -#elif defined EIGEN_VECTORIZE_AVX - -typedef struct { - __m128i x; -} Packet8h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet8h type; - // 
There is no half-size packet for Packet8h. - typedef Packet8h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; - -template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { - Packet8h result; - result.x = _mm_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { - Packet8h result; - result.x = _mm_load_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { - Packet8h result; - result.x = _mm_loadu_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { - _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploadquad(const Eigen::half* from) { - Packet8h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm256_cvtph_ps(a.x); -#else - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); 
- float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - - return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet8h result; - result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN32 float aux[8]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - Eigen::half h4(aux[4]); - Eigen::half h5(aux[5]); - Eigen::half h6(aux[6]); - Eigen::half h7(aux[7]); - - Packet8h result; - result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) -{ - Packet8h result; - result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) -{ - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { - 
Packet8f af = half2float(a); - float reduced = predux(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_max(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_min(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_mul(af); - return Eigen::half(reduced); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m128i a = kernel.packet[0].x; - __m128i b = kernel.packet[1].x; - __m128i c = kernel.packet[2].x; - __m128i d = kernel.packet[3].x; - __m128i e = kernel.packet[4].x; - __m128i f = kernel.packet[5].x; - __m128i g = kernel.packet[6].x; - __m128i h = kernel.packet[7].x; - - __m128i a03b03 = _mm_unpacklo_epi16(a, b); - __m128i c03d03 = _mm_unpacklo_epi16(c, d); - __m128i e03f03 = _mm_unpacklo_epi16(e, f); - __m128i g03h03 = _mm_unpacklo_epi16(g, h); - __m128i a47b47 = _mm_unpackhi_epi16(a, b); - __m128i c47d47 = _mm_unpackhi_epi16(c, d); - __m128i e47f47 = _mm_unpackhi_epi16(e, f); - __m128i g47h47 = _mm_unpackhi_epi16(g, h); - - __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); - __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); - __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); - __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); - __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); - __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); - __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); - __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); - - __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); - __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); - __m128i a2b2c2d2e2f2g2h2 = 
_mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); - __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); - __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); - __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); - __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); - __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); - - kernel.packet[0].x = a0b0c0d0e0f0g0h0; - kernel.packet[1].x = a1b1c1d1e1f1g1h1; - kernel.packet[2].x = a2b2c2d2e2f2g2h2; - kernel.packet[3].x = a3b3c3d3e3f3g3h3; - kernel.packet[4].x = a4b4c4d4e4f4g4h4; - kernel.packet[5].x = a5b5c5d5e5f5g5h5; - kernel.packet[6].x = a6b6c6d6e6f6g6h6; - kernel.packet[7].x = a7b7c7d7e7f7g7h7; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN32 Eigen::half in[4][8]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN32 Eigen::half out[4][8]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -typedef struct { - __m64 x; -} Packet4h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet4h type; - // There is no half-size packet for Packet4h. 
- typedef Packet4h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; - -template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { - Packet4h result; - result.x = _mm_set1_pi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); -} - -template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha + hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = 
half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha * hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE Packet4h -ploadquad(const Eigen::half* from) { - return pset1(*from); -} - -template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) -{ - Packet4h result; - result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) -{ - __int64_t a = _mm_cvtm64_si64(from.x); - to[stride*0].x = static_cast(a); - to[stride*1].x = static_cast(a >> 16); - to[stride*2].x = static_cast(a >> 32); - to[stride*3].x = static_cast(a >> 48); -} - 
-EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); - __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); - - kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); - kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); - kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); - kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); -} - -#endif - -} -} - -#endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/inst/include/Eigen/src/Core/arch/CUDA/TypeCasting.h b/inst/include/Eigen/src/Core/arch/CUDA/TypeCasting.h deleted file mode 100644 index aa5fbce8..00000000 --- a/inst/include/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ /dev/null @@ -1,212 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_TYPE_CASTING_CUDA_H -#define EIGEN_TYPE_CASTING_CUDA_H - -namespace Eigen { - -namespace internal { - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(a); - #else - return Eigen::half(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(static_cast(a)); - #else - return Eigen::half(static_cast(a)); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef float result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __half2float(a); - #else - return static_cast(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - - -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 2, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { - float2 r1 = __half22float2(a); - float2 r2 = __half22float2(b); - return make_float4(r1.x, r1.y, r2.x, r2.y); -} - -template <> -struct 
type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 2 - }; -}; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { - // Simply discard the second half of the input - return __floats2half2_rn(a.x, a.y); -} - -#elif defined EIGEN_VECTORIZE_AVX512 -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { - return float2half(a); -} - -#elif defined EIGEN_VECTORIZE_AVX - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { - return float2half(a); -} - -// Disable the following code since it's broken on too many platforms / compilers. 
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - Eigen::half h = raw_uint16_to_half(static_cast(a64)); - float f1 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 16)); - float f2 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 32)); - float f3 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 48)); - float f4 = static_cast(h); - return _mm_set_ps(f4, f3, f2, f1); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { - EIGEN_ALIGN16 float aux[4]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - - Packet4h result; - result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); - return result; -} - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_CUDA_H diff --git a/inst/include/Eigen/src/Core/arch/Default/BFloat16.h b/inst/include/Eigen/src/Core/arch/Default/BFloat16.h new file mode 100644 index 00000000..1c28f4f9 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/Default/BFloat16.h @@ -0,0 +1,700 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef EIGEN_BFLOAT16_H +#define EIGEN_BFLOAT16_H + +#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD) \ + template <> \ + EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED \ + PACKET_BF16 METHOD(const PACKET_BF16& _x) { \ + return F32ToBf16(METHOD(Bf16ToF32(_x))); \ + } + +namespace Eigen { + +struct bfloat16; + +namespace bfloat16_impl { + +// Make our own __bfloat16_raw definition. +struct __bfloat16_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {} + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {} + unsigned short value; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value); +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +// Forward declarations of template specializations, to avoid Visual C++ 2019 errors, saying: +// > error C2908: explicit specialization; 'float_to_bfloat16_rtne' has already been instantiated +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h); + +struct bfloat16_base : public __bfloat16_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base() {} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {} +}; + +} // namespace bfloat16_impl + +// Class definition. 
+struct bfloat16 : public bfloat16_impl::bfloat16_base { + + typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {} + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const __bfloat16_raw& h) : bfloat16_impl::bfloat16_base(h) {} + + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b) + : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {} + + template + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne::value>(static_cast(val))) {} + + explicit EIGEN_DEVICE_FUNC bfloat16(float f) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne(f)) {} + + // Following the convention of numpy, converting between complex and + // float will lead to loss of imag value. + template + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex& val) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne(static_cast(val.real()))) {} + + EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless. 
+ return bfloat16_impl::bfloat16_to_float(*this); + } +}; +} // namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = std::denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = numeric_limits::round_style; + static const bool is_iec559 = false; + static const bool is_bounded = true; + static const bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 4; + static const int radix = 2; + static const int min_exponent = numeric_limits::min_exponent; + static const int min_exponent10 = numeric_limits::min_exponent10; + static const int max_exponent = numeric_limits::max_exponent; + static const int max_exponent10 = numeric_limits::max_exponent10; + static const bool traps = numeric_limits::traps; + static const bool tinyness_before = numeric_limits::tinyness_before; + + static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); } + static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); } + static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); } + static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); } + static Eigen::bfloat16 round_error() { return Eigen::bfloat16(0x3f00); } + static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); } + static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); } + static Eigen::bfloat16 signaling_NaN() { return 
Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); } + static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // namespace std + +namespace Eigen { + +namespace bfloat16_impl { + +// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, +// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation +// of the functions, while the latter can only deal with one of them. +#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +// We need to provide emulated *host-side* BF16 operators for clang. +#pragma push_macro("EIGEN_DEVICE_FUNC") +#undef EIGEN_DEVICE_FUNC +#if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16) +#define EIGEN_DEVICE_FUNC __host__ +#else // both host and device need emulated ops. +#define EIGEN_DEVICE_FUNC __host__ __device__ +#endif +#endif + +// Definitions for CPUs, mostly working through conversion +// to/from fp32. 
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) { + return bfloat16(float(a) + static_cast(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) { + return bfloat16(static_cast(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) { + bfloat16 result; + result.value = a.value ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) { + a += bfloat16(1); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) { + a -= bfloat16(1); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) { + bfloat16 original_value = a; + ++a; + return original_value; +} +EIGEN_STRONG_INLINE 
EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) { + bfloat16 original_value = a; + --a; + return original_value; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) { + return numext::equal_strict(float(a),float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) { + return numext::not_equal_strict(float(a), float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) { + return float(a) >= float(b); +} + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +#pragma pop_macro("EIGEN_DEVICE_FUNC") +#endif +#endif // Emulate support for bfloat16 floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to bfloat16. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) { + __bfloat16_raw output; + if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) { + output.value = std::signbit(v) ? 
0xFFC0: 0x7FC0; + return output; + } + const uint16_t* p = reinterpret_cast(&v); +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + output.value = p[0]; +#else + output.value = p[1]; +#endif + return output; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) { + return __bfloat16_raw(value); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) { + return bf.value; +} + +// float_to_bfloat16_rtne template specialization that does not make any +// assumption about the value of its function argument (ff). +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) + // Nothing to do here +#else + __bfloat16_raw output; + + if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff)) { + // If the value is a NaN, squash it to a qNaN with msb of fraction set, + // this makes sure after truncation we don't end up with an inf. + // + // qNaN magic: All exponent bits set + most significant bit of fraction + // set. + output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0; + } else { + // Fast rounding algorithm that rounds a half value to nearest even. This + // reduces expected error when we convert a large number of floats. Here + // is how it works: + // + // Definitions: + // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits + // with the following tags: + // + // Sign | Exp (8 bits) | Frac (23 bits) + // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT + // + // S: Sign bit. + // E: Exponent bits. + // F: First 6 bits of fraction. + // L: Least significant bit of resulting bfloat16 if we truncate away the + // rest of the float32. This is also the 7th bit of fraction + // R: Rounding bit, 8th bit of fraction. + // T: Sticky bits, rest of fraction, 15 bits. 
+ // + // To round half to nearest even, there are 3 cases where we want to round + // down (simply truncate the result of the bits away, which consists of + // rounding bit and sticky bits) and two cases where we want to round up + // (truncate then add one to the result). + // + // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of + // 1s) as the rounding bias, adds the rounding bias to the input, then + // truncates the last 16 bits away. + // + // To understand how it works, we can analyze this algorithm case by case: + // + // 1. L = 0, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input may create any carry, depending on + // whether there is any value set to 1 in T bits. + // - R may be set to 1 if there is a carry. + // - L remains 0. + // - Note that this case also handles Inf and -Inf, where all fraction + // bits, including L, R and Ts are all 0. The output remains Inf after + // this algorithm. + // + // 2. L = 1, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits but + // adds 1 to rounding bit. + // - L remains 1. + // + // 3. L = 0, R = 1, all of T are 0: + // Expect: round down, this is exactly at half, the result is already + // even (L=0). + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input sets all sticky bits to 1, but + // doesn't create a carry. + // - R remains 1. + // - L remains 0. + // + // 4. L = 1, R = 1: + // Expect: round up, this is exactly at half, the result needs to be + // round to the next even number. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits, but + // creates a carry from rounding bit. 
+ // - The carry sets L to 0, creates another carry bit and propagate + // forward to F bits. + // - If all the F bits are 1, a carry then propagates to the exponent + // bits, which then creates the minimum value with the next exponent + // value. Note that we won't have the case where exponents are all 1, + // since that's either a NaN (handled in the other if condition) or inf + // (handled in case 1). + // + // 5. L = 0, R = 1, any of T is 1: + // Expect: round up, this is greater than half. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input creates a carry from sticky bits, + // sets rounding bit to 0, then create another carry. + // - The second carry sets L to 1. + // + // Examples: + // + // Exact half value that is already even: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 + // + // This falls into case 3. We truncate the rest of 16 bits and no + // carry is created into F and L: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // Exact half value, round to next even number: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 + // + // This falls into case 4. We create a carry from R and T, + // which then propagates into L and F: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // + // Max denormal value round to min normal value: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. 
We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 + // + // Max normal value round to Inf: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 + + // At this point, ff must be either a normal float, or +/-infinity. + output = float_to_bfloat16_rtne(ff); + } + return output; +#endif +} + +// float_to_bfloat16_rtne template specialization that assumes that its function +// argument (ff) is either a normal floating point number, or +/-infinity, or +// zero. Used to improve the runtime performance of conversion from an integer +// type to bfloat16. +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) + // Nothing to do here +#else + numext::uint32_t input = numext::bit_cast(ff); + __bfloat16_raw output; + + // Least significant bit of resulting bfloat. 
+ numext::uint32_t lsb = (input >> 16) & 1; + numext::uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + output.value = static_cast(input >> 16); + return output; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) { + float result = 0; + unsigned short* q = reinterpret_cast(&result); +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = h.value; +#else + q[1] = h.value; +#endif + return result; +} +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) { + EIGEN_USING_STD(isinf); + return (isinf)(float(a)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) { + EIGEN_USING_STD(isnan); + return (isnan)(float(a)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) { + bfloat16 result; + result.value = a.value & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) { + return bfloat16(::expf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) { + return bfloat16(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) { + return bfloat16(::logf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) { + return bfloat16(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) { + return bfloat16(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) { + return bfloat16(static_cast(EIGEN_LOG2E) * ::logf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) { + return bfloat16(::sqrtf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, 
const bfloat16& b) { + return bfloat16(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) { + return bfloat16(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) { + return bfloat16(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) { + return bfloat16(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) { + return bfloat16(::asinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) { + return bfloat16(::acosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) { + return bfloat16(::atanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) { + return bfloat16(::sinhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) { + return bfloat16(::coshf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) { + return bfloat16(::tanhf(float(a))); +} +#if EIGEN_HAS_CXX11_MATH +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) { + return bfloat16(::asinhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) { + return bfloat16(::acoshf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) { + return bfloat16(::atanhf(float(a))); +} +#endif +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) { + return bfloat16(::floorf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) { + return bfloat16(::ceilf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) { + return bfloat16(::rintf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) { + return bfloat16(::roundf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 
fmod(const bfloat16& a, const bfloat16& b) { + return bfloat16(::fmodf(float(a), float(b))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f2 < f1 ? b : a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f1 < f2 ? b : a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return bfloat16(::fminf(f1, f2)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return bfloat16(::fmaxf(f1, f2)); +} + +#ifndef EIGEN_NO_IO +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) { + os << static_cast(v); + return os; +} +#endif + +} // namespace bfloat16_impl + +namespace internal { + +template<> +struct random_default_impl +{ + static inline bfloat16 run(const bfloat16& x, const bfloat16& y) + { + return x + (y-x) * bfloat16(float(std::rand()) / float(RAND_MAX)); + } + static inline bfloat16 run() + { + return run(bfloat16(-1.f), bfloat16(1.f)); + } +}; + +template<> struct is_arithmetic { enum { value = true }; }; + +} // namespace internal + +template<> struct NumTraits + : GenericNumTraits +{ + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D); // bfloat16(5e-2f); + + } + EIGEN_DEVICE_FUNC 
EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 lowest() { + return bfloat16_impl::raw_uint16_to_bfloat16(0xFF7F); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 infinity() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 quiet_NaN() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); + } +}; + +} // namespace Eigen + +namespace Eigen { +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isnan)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isnan)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isinf)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isinf)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isfinite)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isfinite)(h); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast(const uint16_t& src) { + return Eigen::bfloat16(Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src)); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast(const Eigen::bfloat16& src) { + return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src); +} + +} // namespace numext +} // namespace Eigen + +#if EIGEN_HAS_STD_HASH +namespace std { +template <> +struct hash { + EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const { + return static_cast(Eigen::numext::bit_cast(a)); + } +}; +} // namespace std +#endif + + +#endif // EIGEN_BFLOAT16_H diff --git a/inst/include/Eigen/src/Core/arch/Default/ConjHelper.h b/inst/include/Eigen/src/Core/arch/Default/ConjHelper.h index 4cfe34e0..53830b5a 100644 --- a/inst/include/Eigen/src/Core/arch/Default/ConjHelper.h +++ b/inst/include/Eigen/src/Core/arch/Default/ConjHelper.h 
@@ -11,19 +11,107 @@ #ifndef EIGEN_ARCH_CONJ_HELPER_H #define EIGEN_ARCH_CONJ_HELPER_H -#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ - }; \ - \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, \ + const PACKET_CPLX& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, \ + const PACKET_CPLX& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); \ + } \ + }; \ + \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, \ + const PACKET_REAL& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, \ + const PACKET_REAL& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); \ + } \ }; -#endif // EIGEN_ARCH_CONJ_HELPER_H +namespace Eigen { +namespace internal { + +template struct conj_if; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return 
internal::pconj(x); } +}; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; } +}; + +// Generic Implementation, assume scalars since the packet-version is +// specialized below. +template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsType& x, const RhsType& y) const + { return conj_if()(x) * conj_if()(y); } +}; + +template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsScalar& x, const RhsScalar& y) const + { return numext::conj(x * y); } +}; + +// Implementation with equal type, use packet operations. 
+template +struct conj_helper +{ + typedef Packet ResultType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } +}; + +template +struct conj_helper +{ + typedef Packet ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } + // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return pconj(Eigen::internal::pmul(x, y)); } +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h new file mode 100644 index 00000000..c9fbaf68 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -0,0 +1,1649 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// Copyright (C) 2009-2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/* The exp and log functions of this file initially come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Creates a Scalar integer type with same bit-width. +template struct make_integer; +template<> struct make_integer { typedef numext::int32_t type; }; +template<> struct make_integer { typedef numext::int64_t type; }; +template<> struct make_integer { typedef numext::int16_t type; }; +template<> struct make_integer { typedef numext::int16_t type; }; + +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic_get_biased_exponent(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::integer_packet PacketI; + enum { mantissa_bits = numext::numeric_limits::digits - 1}; + return pcast(plogical_shift_right(preinterpret(pabs(a)))); +} + +// Safely applies frexp, correctly handles denormals. +// Assumes IEEE floating point format. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic(const Packet& a, Packet& exponent) { + typedef typename unpacket_traits::type Scalar; + typedef typename make_unsigned::type>::type ScalarUI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = + ~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << int(MantissaBits)); // ~0x7f800000 + const Packet sign_mantissa_mask = pset1frombits(static_cast(scalar_sign_mantissa_mask)); + const Packet half = pset1(Scalar(0.5)); + const Packet zero = pzero(a); + const Packet normal_min = pset1((numext::numeric_limits::min)()); // Minimum normal value, 2^-126 + + // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1). 
+ const Packet is_denormal = pcmp_lt(pabs(a), normal_min); + EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(int(MantissaBits) + 1); // 24 + // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr. + const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24 + const Packet normalization_factor = pset1(scalar_normalization_factor); + const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a); + + // Determine exponent offset: -126 if normal, -126-24 if denormal + const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(int(ExponentBits)-1)) - ScalarUI(2)); // -126 + Packet exponent_offset = pset1(scalar_exponent_offset); + const Packet normalization_offset = pset1(-Scalar(scalar_normalization_offset)); // -24 + exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset); + + // Determine exponent and mantissa from normalized_a. + exponent = pfrexp_generic_get_biased_exponent(normalized_a); + // Zero, Inf and NaN return 'a' unmodified, exponent is zero + // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero) + const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)); // 255 + const Packet non_finite_exponent = pset1(scalar_non_finite_exponent); + const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent)); + const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half)); + exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset)); + return m; +} + +// Safely applies ldexp, correctly handles overflows, underflows and denormals. +// Assumes IEEE floating point format. 
+template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pldexp_generic(const Packet& a, const Packet& exponent) { + // We want to return a * 2^exponent, allowing for all possible integer + // exponents without overflowing or underflowing in intermediate + // computations. + // + // Since 'a' and the output can be denormal, the maximum range of 'exponent' + // to consider for a float is: + // -255-23 -> 255+23 + // Below -278 any finite float 'a' will become zero, and above +278 any + // finite float will become inf, including when 'a' is the smallest possible + // denormal. + // + // Unfortunately, 2^(278) cannot be represented using either one or two + // finite normal floats, so we must split the scale factor into at least + // three parts. It turns out to be faster to split 'exponent' into four + // factors, since [exponent>>2] is much faster to compute that [exponent/3]. + // + // Set e = min(max(exponent, -278), 278); + // b = floor(e/4); + // out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b)) + // + // This will avoid any intermediate overflows and correctly handle 0, inf, + // NaN cases. 
+ typedef typename unpacket_traits::integer_packet PacketI; + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + const Packet max_exponent = pset1(Scalar((ScalarI(1)<((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)); // 127 + const PacketI e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + PacketI b = parithmetic_shift_right<2>(e); // floor(e/4); + Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b + Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) + out = pmul(out, c); + return out; +} + +// Explicitly multiplies +// a * (2^e) +// clamping e to the range +// [NumTraits::min_exponent()-2, NumTraits::max_exponent()] +// +// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow +// if 2^e doesn't fit into a normal floating-point Scalar. +// +// Assumes IEEE floating point format +template +struct pldexp_fast_impl { + typedef typename unpacket_traits::integer_packet PacketI; + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + Packet run(const Packet& a, const Packet& exponent) { + const Packet bias = pset1(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1))); // 127 + const Packet limit = pset1(Scalar((ScalarI(1)<(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 + // return a * (2^e) + return pmul(a, preinterpret(plogical_shift_left(e))); + } +}; + +// Natural or base 2 logarithm. 
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) +// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can +// be easily approximated by a polynomial centered on m=1 for stability. +// TODO(gonnet): Further reduce the interval allowing for lower-degree +// polynomial interpolants -> ... -> profit! +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_impl_float(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0f); + const Packet cst_neg_half = pset1(-0.5f); + // The smallest non denormalized float number. + const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); + const Packet cst_minus_inf = pset1frombits( 0xff800000u); + const Packet cst_pos_inf = pset1frombits( 0x7f800000u); + + // Polynomial coefficients. + const Packet cst_cephes_SQRTHF = pset1(0.707106781186547524f); + const Packet cst_cephes_log_p0 = pset1(7.0376836292E-2f); + const Packet cst_cephes_log_p1 = pset1(-1.1514610310E-1f); + const Packet cst_cephes_log_p2 = pset1(1.1676998740E-1f); + const Packet cst_cephes_log_p3 = pset1(-1.2420140846E-1f); + const Packet cst_cephes_log_p4 = pset1(+1.4249322787E-1f); + const Packet cst_cephes_log_p5 = pset1(-1.6668057665E-1f); + const Packet cst_cephes_log_p6 = pset1(+2.0000714765E-1f); + const Packet cst_cephes_log_p7 = pset1(-2.4999993993E-1f); + const Packet cst_cephes_log_p8 = pset1(+3.3333331174E-1f); + + // Truncate input values to the minimum positive normal. + x = pmax(x, cst_min_norm_pos); + + Packet e; + // extract significant in the range [0.5,1) and exponent + x = pfrexp(x,e); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. 
+ // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); + Packet tmp = pand(x, mask); + x = psub(x, cst_1); + e = psub(e, pand(cst_1, mask)); + x = padd(x, tmp); + + Packet x2 = pmul(x, x); + Packet x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet y, y1, y2; + y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); + y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); + y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7); + y = pmadd(y, x, cst_cephes_log_p2); + y1 = pmadd(y1, x, cst_cephes_log_p5); + y2 = pmadd(y2, x, cst_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y = pmadd(cst_neg_half, x2, y); + x = padd(x, y); + + // Add the logarithm of the exponent back to the result of the interpolation. + if (base2) { + const Packet cst_log2e = pset1(static_cast(EIGEN_LOG2E)); + x = pmadd(x, cst_log2e, e); + } else { + const Packet cst_ln2 = pset1(static_cast(EIGEN_LN2)); + x = pmadd(e, cst_ln2, x); + } + + Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); + Packet iszero_mask = pcmp_eq(_x,pzero(_x)); + Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN + // - 0 will be -INF + // - +INF will be +INF + return pselect(iszero_mask, cst_minus_inf, + por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_float(const Packet _x) +{ + return plog_impl_float(_x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_float(const Packet _x) +{ + return plog_impl_float(_x); +} + +/* Returns the base e (2.718...) or base 2 logarithm of x. + * The argument is separated into its exponent and fractional parts. 
+ * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)], + * is approximated by + * + * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x). + * + * for more detail see: http://www.netlib.org/cephes/ + */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_impl_double(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0); + const Packet cst_neg_half = pset1(-0.5); + // The smallest non denormalized double. + const Packet cst_min_norm_pos = pset1frombits( static_cast(0x0010000000000000ull)); + const Packet cst_minus_inf = pset1frombits( static_cast(0xfff0000000000000ull)); + const Packet cst_pos_inf = pset1frombits( static_cast(0x7ff0000000000000ull)); + + + // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x) + // 1/sqrt(2) <= x < sqrt(2) + const Packet cst_cephes_SQRTHF = pset1(0.70710678118654752440E0); + const Packet cst_cephes_log_p0 = pset1(1.01875663804580931796E-4); + const Packet cst_cephes_log_p1 = pset1(4.97494994976747001425E-1); + const Packet cst_cephes_log_p2 = pset1(4.70579119878881725854E0); + const Packet cst_cephes_log_p3 = pset1(1.44989225341610930846E1); + const Packet cst_cephes_log_p4 = pset1(1.79368678507819816313E1); + const Packet cst_cephes_log_p5 = pset1(7.70838733755885391666E0); + + const Packet cst_cephes_log_q0 = pset1(1.0); + const Packet cst_cephes_log_q1 = pset1(1.12873587189167450590E1); + const Packet cst_cephes_log_q2 = pset1(4.52279145837532221105E1); + const Packet cst_cephes_log_q3 = pset1(8.29875266912776603211E1); + const Packet cst_cephes_log_q4 = pset1(7.11544750618563894466E1); + const Packet cst_cephes_log_q5 = pset1(2.31251620126765340583E1); + + // Truncate input values to the minimum positive normal. + x = pmax(x, cst_min_norm_pos); + + Packet e; + // extract significant in the range [0.5,1) and exponent + x = pfrexp(x,e); + + // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. 
The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); + Packet tmp = pand(x, mask); + x = psub(x, cst_1); + e = psub(e, pand(cst_1, mask)); + x = padd(x, tmp); + + Packet x2 = pmul(x, x); + Packet x3 = pmul(x2, x); + + // Evaluate the polynomial approximant , probably to improve instruction-level parallelism. + // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) ); + Packet y, y1, y_; + y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); + y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); + y = pmadd(y, x, cst_cephes_log_p2); + y1 = pmadd(y1, x, cst_cephes_log_p5); + y_ = pmadd(y, x3, y1); + + y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1); + y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4); + y = pmadd(y, x, cst_cephes_log_q2); + y1 = pmadd(y1, x, cst_cephes_log_q5); + y = pmadd(y, x3, y1); + + y_ = pmul(y_, x3); + y = pdiv(y_, y); + + y = pmadd(cst_neg_half, x2, y); + x = padd(x, y); + + // Add the logarithm of the exponent back to the result of the interpolation. 
+ if (base2) { + const Packet cst_log2e = pset1(static_cast(EIGEN_LOG2E)); + x = pmadd(x, cst_log2e, e); + } else { + const Packet cst_ln2 = pset1(static_cast(EIGEN_LN2)); + x = pmadd(e, cst_ln2, x); + } + + Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); + Packet iszero_mask = pcmp_eq(_x,pzero(_x)); + Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN + // - 0 will be -INF + // - +INF will be +INF + return pselect(iszero_mask, cst_minus_inf, + por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_double(const Packet _x) +{ + return plog_impl_double(_x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_double(const Packet _x) +{ + return plog_impl_double(_x); +} + +/** \internal \returns log(1 + x) computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_plog1p(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + Packet xp1 = padd(x, one); + Packet small_mask = pcmp_eq(xp1, one); + Packet log1 = plog(xp1); + Packet inf_mask = pcmp_eq(xp1, log1); + Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); + return pselect(por(small_mask, inf_mask), x, log_large); +} + +/** \internal \returns exp(x)-1 computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_expm1(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + const Packet neg_one = pset1(ScalarType(-1)); + Packet u = pexp(x); + Packet one_mask = pcmp_eq(u, one); + Packet u_minus_one = psub(u, one); + Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one); + Packet logu = plog(u); + // The following comparison is to catch the case where + // exp(x) = +inf. 
It is written in this way to avoid having + // to form the constant +inf, which depends on the packet + // type. + Packet pos_inf_mask = pcmp_eq(logu, u); + Packet expm1 = pmul(u_minus_one, pdiv(x, logu)); + expm1 = pselect(pos_inf_mask, u, expm1); + return pselect(one_mask, + x, + pselect(neg_one_mask, + neg_one, + expm1)); +} + + +// Exponential function. Works by writing "x = m*log(2) + r" where +// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then +// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_float(const Packet _x) +{ + const Packet cst_1 = pset1(1.0f); + const Packet cst_half = pset1(0.5f); + const Packet cst_exp_hi = pset1( 88.723f); + const Packet cst_exp_lo = pset1(-88.723f); + + const Packet cst_cephes_LOG2EF = pset1(1.44269504088896341f); + const Packet cst_cephes_exp_p0 = pset1(1.9875691500E-4f); + const Packet cst_cephes_exp_p1 = pset1(1.3981999507E-3f); + const Packet cst_cephes_exp_p2 = pset1(8.3334519073E-3f); + const Packet cst_cephes_exp_p3 = pset1(4.1665795894E-2f); + const Packet cst_cephes_exp_p4 = pset1(1.6666665459E-1f); + const Packet cst_cephes_exp_p5 = pset1(5.0000001201E-1f); + + // Clamp x. + Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo); + + // Express exp(x) as exp(m*ln(2) + r), start by extracting + // m = floor(x/ln(2) + 0.5). + Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half)); + + // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is + // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating + // truncation errors. + const Packet cst_cephes_exp_C1 = pset1(-0.693359375f); + const Packet cst_cephes_exp_C2 = pset1(2.12194440e-4f); + Packet r = pmadd(m, cst_cephes_exp_C1, x); + r = pmadd(m, cst_cephes_exp_C2, r); + + Packet r2 = pmul(r, r); + Packet r3 = pmul(r2, r); + + // Evaluate the polynomial approximant,improved by instruction-level parallelism. 
+ Packet y, y1, y2; + y = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1); + y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4); + y2 = padd(r, cst_1); + y = pmadd(y, r, cst_cephes_exp_p2); + y1 = pmadd(y1, r, cst_cephes_exp_p5); + y = pmadd(y, r3, y1); + y = pmadd(y, r2, y2); + + // Return 2^m * exp(r). + // TODO: replace pldexp with faster implementation since y in [-1, 1). + return pmax(pldexp(y,m), _x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_double(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0); + const Packet cst_2 = pset1(2.0); + const Packet cst_half = pset1(0.5); + + const Packet cst_exp_hi = pset1(709.784); + const Packet cst_exp_lo = pset1(-709.784); + + const Packet cst_cephes_LOG2EF = pset1(1.4426950408889634073599); + const Packet cst_cephes_exp_p0 = pset1(1.26177193074810590878e-4); + const Packet cst_cephes_exp_p1 = pset1(3.02994407707441961300e-2); + const Packet cst_cephes_exp_p2 = pset1(9.99999999999999999910e-1); + const Packet cst_cephes_exp_q0 = pset1(3.00198505138664455042e-6); + const Packet cst_cephes_exp_q1 = pset1(2.52448340349684104192e-3); + const Packet cst_cephes_exp_q2 = pset1(2.27265548208155028766e-1); + const Packet cst_cephes_exp_q3 = pset1(2.00000000000000000009e0); + const Packet cst_cephes_exp_C1 = pset1(0.693145751953125); + const Packet cst_cephes_exp_C2 = pset1(1.42860682030941723212e-6); + + Packet tmp, fx; + + // clamp x + x = pmax(pmin(x, cst_exp_hi), cst_exp_lo); + // Express exp(x) as exp(g + n*log(2)). + fx = pmadd(cst_cephes_LOG2EF, x, cst_half); + + // Get the integer modulus of log(2), i.e. the "n" described above. + fx = pfloor(fx); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. 
+ tmp = pmul(fx, cst_cephes_exp_C1); + Packet z = pmul(fx, cst_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet px = cst_cephes_exp_p0; + px = pmadd(px, x2, cst_cephes_exp_p1); + px = pmadd(px, x2, cst_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. + Packet qx = cst_cephes_exp_q0; + qx = pmadd(qx, x2, cst_cephes_exp_q1); + qx = pmadd(qx, x2, cst_cephes_exp_q2); + qx = pmadd(qx, x2, cst_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = pdiv(px, psub(qx, px)); + x = pmadd(cst_2, x, cst_1); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + // TODO: replace pldexp with faster implementation since x in [-1, 1). + return pmax(pldexp(x,fx), _x); +} + +// The following code is inspired by the following stack-overflow answer: +// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 +// It has been largely optimized: +// - By-pass calls to frexp. +// - Aligned loads of required 96 bits of 2/pi. This is accomplished by +// (1) balancing the mantissa and exponent to the required bits of 2/pi are +// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. +// - Avoid a branch in rounding and extraction of the remaining fractional part. +// Overall, I measured a speed up higher than x2 on x86-64. 
+inline float trig_reduce_huge (float xf, int *quadrant) +{ + using Eigen::numext::int32_t; + using Eigen::numext::uint32_t; + using Eigen::numext::int64_t; + using Eigen::numext::uint64_t; + + const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62 + const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt + + // 192 bits of 2/pi for Payne-Hanek reduction + // Bits are introduced by packet of 8 to enable aligned reads. + static const uint32_t two_over_pi [] = + { + 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, + 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, + 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, + 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, + 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566, + 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, + 0x10e41000, 0xe4100000 + }; + + uint32_t xi = numext::bit_cast(xf); + // Below, -118 = -126 + 8. + // -126 is to get the exponent, + // +8 is to enable alignment of 2/pi's bits on 8 bits. + // This is possible because the fractional part of x as only 24 meaningful bits. + uint32_t e = (xi >> 23) - 118; + // Extract the mantissa and shift it to align it wrt the exponent + xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7); + + uint32_t i = e >> 3; + uint32_t twoopi_1 = two_over_pi[i-1]; + uint32_t twoopi_2 = two_over_pi[i+3]; + uint32_t twoopi_3 = two_over_pi[i+7]; + + // Compute x * 2/pi in 2.62-bit fixed-point format. + uint64_t p; + p = uint64_t(xi) * twoopi_3; + p = uint64_t(xi) * twoopi_2 + (p >> 32); + p = (uint64_t(xi * twoopi_1) << 32) + p; + + // Round to nearest: add 0.5 and extract integral part. + uint64_t q = (p + zero_dot_five) >> 62; + *quadrant = int(q); + // Now it remains to compute "r = x - q*pi/2" with high accuracy, + // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as: + // r = (p-q)*pi/2, + // where the product can be be carried out with sufficient accuracy using double precision. 
+ p -= q<<62; + return float(double(int64_t(p)) * pio2_62); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT +__attribute__((optimize("-fno-unsafe-math-optimizations"))) +#endif +Packet psincos_float(const Packet& _x) +{ + typedef typename unpacket_traits::integer_packet PacketI; + + const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI + const Packet cst_rounding_magic = pset1(12582912); // 2^23 for rounding + const PacketI csti_1 = pset1(1); + const Packet cst_sign_mask = pset1frombits(0x80000000u); + + Packet x = pabs(_x); + + // Scale x by 2/Pi to find x's octant. + Packet y = pmul(x, cst_2oPI); + + // Rounding trick: + Packet y_round = padd(y, cst_rounding_magic); + EIGEN_OPTIMIZATION_BARRIER(y_round) + PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) + y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // using "Extended precision modular arithmetic" + #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) + // This version requires true FMA for high accuracy + // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): + const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; + x = pmadd(y, pset1(-1.57079601287841796875f), x); + x = pmadd(y, pset1(-3.1391647326017846353352069854736328125e-07f), x); + x = pmadd(y, pset1(-5.390302529957764765544681040410068817436695098876953125e-15f), x); + #else + // Without true FMA, the previous set of coefficients maintain 1ULP accuracy + // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7. + // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs. + + // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively. + // and 2 ULP up to: + const float huge_th = ComputeSine ? 
25966.f : 18838.f; + x = pmadd(y, pset1(-1.5703125), x); // = 0xbfc90000 + EIGEN_OPTIMIZATION_BARRIER(x) + x = pmadd(y, pset1(-0.000483989715576171875), x); // = 0xb9fdc000 + EIGEN_OPTIMIZATION_BARRIER(x) + x = pmadd(y, pset1(1.62865035235881805419921875e-07), x); // = 0x342ee000 + x = pmadd(y, pset1(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee + + // For the record, the following set of coefficients maintain 2ULP up + // to a slightly larger range: + // const float huge_th = ComputeSine ? 51981.f : 39086.125f; + // but it slightly fails to maintain 1ULP for two values of sin below pi. + // x = pmadd(y, pset1(-3.140625/2.), x); + // x = pmadd(y, pset1(-0.00048351287841796875), x); + // x = pmadd(y, pset1(-3.13855707645416259765625e-07), x); + // x = pmadd(y, pset1(-6.0771006282767103812147979624569416046142578125e-11), x); + + // For the record, with only 3 iterations it is possible to maintain + // 1 ULP up to 3PI (maybe more) and 2ULP up to 255. + // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee + #endif + + if(predux_any(pcmp_le(pset1(huge_th),pabs(_x)))) + { + const int PacketSize = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize]; + pstoreu(vals, pabs(_x)); + pstoreu(x_cpy, x); + pstoreu(y_int2, y_int); + for(int k=0; k=huge_th && (numext::isfinite)(val)) + x_cpy[k] = trig_reduce_huge(val,&y_int2[k]); + } + x = ploadu(x_cpy); + y_int = ploadu(y_int2); + } + + // Compute the sign to apply to the polynomial. + // sin: sign = second_bit(y_int) xor signbit(_x) + // cos: sign = second_bit(y_int+1) + Packet sign_bit = ComputeSine ? 
pxor(_x, preinterpret(plogical_shift_left<30>(y_int))) + : preinterpret(plogical_shift_left<30>(padd(y_int,csti_1))); + sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit + + // Get the polynomial selection mask from the second bit of y_int + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); + + Packet x2 = pmul(x,x); + + // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4) + Packet y1 = pset1(2.4372266125283204019069671630859375e-05f); + y1 = pmadd(y1, x2, pset1(-0.00138865201734006404876708984375f )); + y1 = pmadd(y1, x2, pset1(0.041666619479656219482421875f )); + y1 = pmadd(y1, x2, pset1(-0.5f)); + y1 = pmadd(y1, x2, pset1(1.f)); + + // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) + // octave/matlab code to compute those coefficients: + // x = (0:0.0001:pi/4)'; + // A = [x.^3 x.^5 x.^7]; + // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy + // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 + // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) + // + Packet y2 = pset1(-0.0001959234114083702898469196984621021329076029360294342041015625f); + y2 = pmadd(y2, x2, pset1( 0.0083326873655616851693794799871284340042620897293090820312500000f)); + y2 = pmadd(y2, x2, pset1(-0.1666666203982298255503735617821803316473960876464843750000000000f)); + y2 = pmul(y2, x2); + y2 = pmadd(y2, x, x); + + // Select the correct result from the two polynomials. + y = ComputeSine ? 
pselect(poly_mask,y2,y1) + : pselect(poly_mask,y1,y2); + + // Update the sign and filter huge inputs + return pxor(y, sign_bit); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psin_float(const Packet& x) +{ + return psincos_float(x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pcos_float(const Packet& x) +{ + return psincos_float(x); +} + + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psqrt_complex(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename unpacket_traits::as_real RealPacket; + + // Computes the principal sqrt of the complex numbers in the input. + // + // For example, for packets containing 2 complex numbers stored in interleaved format + // a = [a0, a1] = [x0, y0, x1, y1], + // where x0 = real(a0), y0 = imag(a0) etc., this function returns + // b = [b0, b1] = [u0, v0, u1, v1], + // such that b0^2 = a0, b1^2 = a1. + // + // To derive the formula for the complex square roots, let's consider the equation for + // a single complex square root of the number x + i*y. We want to find real numbers + // u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*v. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = 0.5 * (y / u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = 0.5 * (y / v) + // + // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as + // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) , + + // In the following, without lack of generality, we have annotated the code, assuming + // that the input is a packet of 2 complex numbers. + // + // Step 1. 
Compute l = [l0, l0, l1, l1], where + // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2) + // To avoid over- and underflow, we use the stable formula for each hypotenuse + // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), + // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. + + RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|] + RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|] + RealPacket a_max = pmax(a_abs, a_abs_flip); + RealPacket a_min = pmin(a_abs, a_abs_flip); + RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); + RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max)); + RealPacket r = pdiv(a_min, a_max); + const RealPacket cst_one = pset1(RealScalar(1)); + RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1] + // Set l to a_max if a_min is zero. + l = pselect(a_min_zero_mask, a_max, l); + + // Step 2. Compute [rho0, *, rho1, *], where + // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|)) + // We don't care about the imaginary parts computed here. They will be overwritten later. + const RealPacket cst_half = pset1(RealScalar(0.5)); + Packet rho; + rho.v = psqrt(pmul(cst_half, padd(a_abs, l))); + + // Step 3. Compute [rho0, eta0, rho1, eta1], where + // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2. + // set eta = 0 of input is 0 + i0. + RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask); + RealPacket real_mask = peven_mask(a.v); + Packet positive_real_result; + // Compute result for inputs with positive real part. + positive_real_result.v = pselect(real_mask, rho.v, eta); + + // Step 4. 
Compute solution for inputs with negative real part: + // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1] + const RealScalar neg_zero = RealScalar(numext::bit_cast(0x80000000u)); + const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), neg_zero)).v; + RealPacket imag_signs = pand(a.v, cst_imag_sign_mask); + Packet negative_real_result; + // Notice that rho is positive, so taking it's absolute value is a noop. + negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs); + + // Step 5. Select solution branch based on the sign of the real parts. + Packet negative_real_mask; + negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v)); + negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v); + Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result); + + // Step 6. Handle special cases for infinities: + // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN + // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN + // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y + // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + Packet is_inf; + is_inf.v = pcmp_eq(a_abs, cst_pos_inf); + Packet is_real_inf; + is_real_inf.v = pand(is_inf.v, real_mask); + is_real_inf = por(is_real_inf, pcplxflip(is_real_inf)); + // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part. + Packet real_inf_result; + real_inf_result.v = pmul(a_abs, pset1(Scalar(RealScalar(1.0), RealScalar(0.0))).v); + real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v); + // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part. 
+ Packet is_imag_inf; + is_imag_inf.v = pandnot(is_inf.v, real_mask); + is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf)); + Packet imag_inf_result; + imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask)); + + return pselect(is_imag_inf, imag_inf_result, + pselect(is_real_inf, real_inf_result,result)); +} + +// TODO(rmlarsen): The following set of utilities for double word arithmetic +// should perhaps be refactored as a separate file, since it would be generally +// useful for special function implementation etc. Writing the algorithms in +// terms if a double word type would also make the code more readable. + +// This function splits x into the nearest integer n and fractional part r, +// such that x = n + r holds exactly. +template +EIGEN_STRONG_INLINE +void absolute_split(const Packet& x, Packet& n, Packet& r) { + n = pround(x); + r = psub(x, n); +} + +// This function computes the sum {s, r}, such that x + y = s_hi + s_lo +// holds exactly, and s_hi = fl(x+y), if |x| >= |y|. +template +EIGEN_STRONG_INLINE +void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) { + s_hi = padd(x, y); + const Packet t = psub(s_hi, x); + s_lo = psub(y, t); +} + +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +// This function implements the extended precision product of +// a pair of floating point numbers. Given {x, y}, it computes the pair +// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and +// p_hi = fl(x * y). +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x, const Packet& y, + Packet& p_hi, Packet& p_lo) { + p_hi = pmul(x, y); + p_lo = pmadd(x, y, pnegate(p_hi)); +} + +#else + +// This function implements the Veltkamp splitting. Given a floating point +// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds +// exactly and that half of the significant of x fits in x_hi. +// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. 
+template +EIGEN_STRONG_INLINE +void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) { + typedef typename unpacket_traits::type Scalar; + EIGEN_CONSTEXPR int shift = (NumTraits::digits() + 1) / 2; + const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr. + const Packet gamma = pmul(pset1(shift_scale + Scalar(1)), x); + Packet rho = psub(x, gamma); + x_hi = padd(rho, gamma); + x_lo = psub(x, x_hi); +} + +// This function implements Dekker's algorithm for products x * y. +// Given floating point numbers {x, y} computes the pair +// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and +// p_hi = fl(x * y). +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x, const Packet& y, + Packet& p_hi, Packet& p_lo) { + Packet x_hi, x_lo, y_hi, y_lo; + veltkamp_splitting(x, x_hi, x_lo); + veltkamp_splitting(y, y_hi, y_lo); + + p_hi = pmul(x, y); + p_lo = pmadd(x_hi, y_hi, pnegate(p_hi)); + p_lo = pmadd(x_hi, y_lo, p_lo); + p_lo = pmadd(x_lo, y_hi, p_lo); + p_lo = pmadd(x_lo, y_lo, p_lo); +} + +#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD + + +// This function implements Dekker's algorithm for the addition +// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. +// It returns the result as a pair {s_hi, s_lo} such that +// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly. +// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. 
+template +EIGEN_STRONG_INLINE + void twosum(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi)); + Packet r_hi_1, r_lo_1; + fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1); + Packet r_hi_2, r_lo_2; + fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2); + const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2); + + const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo); + const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo); + const Packet s = pselect(x_greater_mask, s1, s2); + + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This is a version of twosum for double word numbers, +// which assumes that |x_hi| >= |y_hi|. +template +EIGEN_STRONG_INLINE + void fast_twosum(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + Packet r_hi, r_lo; + fast_twosum(x_hi, y_hi, r_hi, r_lo); + const Packet s = padd(padd(y_lo, r_lo), x_lo); + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This is a version of twosum for adding a floating point number x to +// double word number {y_hi, y_lo} number, with the assumption +// that |x| >= |y_hi|. +template +EIGEN_STRONG_INLINE +void fast_twosum(const Packet& x, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + Packet r_hi, r_lo; + fast_twosum(x, y_hi, r_hi, r_lo); + const Packet s = padd(y_lo, r_lo); + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This function implements the multiplication of a double word +// number represented by {x_hi, x_lo} by a floating point number y. +// It returns the result as a pair {p_hi, p_lo} such that +// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error +// of less than 2*2^{-2p}, where p is the number of significand bit +// in the floating point type. +// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. 
+template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, + Packet& p_hi, Packet& p_lo) { + Packet c_hi, c_lo1; + twoprod(x_hi, y, c_hi, c_lo1); + const Packet c_lo2 = pmul(x_lo, y); + Packet t_hi, t_lo1; + fast_twosum(c_hi, c_lo2, t_hi, t_lo1); + const Packet t_lo2 = padd(t_lo1, c_lo1); + fast_twosum(t_hi, t_lo2, p_hi, p_lo); +} + +// This function implements the multiplication of two double word +// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. +// It returns the result as a pair {p_hi, p_lo} such that +// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error +// of less than 2*2^{-2p}, where p is the number of significand bit +// in the floating point type. +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& p_hi, Packet& p_lo) { + Packet p_hi_hi, p_hi_lo; + twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo); + Packet p_lo_hi, p_lo_lo; + twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo); + fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo); +} + +// This function computes the reciprocal of a floating point number +// with extra precision and returns the result as a double word. +template +void doubleword_reciprocal(const Packet& x, Packet& recip_hi, Packet& recip_lo) { + typedef typename unpacket_traits::type Scalar; + // 1. Approximate the reciprocal as the reciprocal of the high order element. + Packet approx_recip = prsqrt(x); + approx_recip = pmul(approx_recip, approx_recip); + + // 2. Run one step of Newton-Raphson iteration in double word arithmetic + // to get the bottom half. 
The NR iteration for reciprocal of 'a' is + // x_{i+1} = x_i * (2 - a * x_i) + + // -a*x_i + Packet t1_hi, t1_lo; + twoprod(pnegate(x), approx_recip, t1_hi, t1_lo); + // 2 - a*x_i + Packet t2_hi, t2_lo; + fast_twosum(pset1(Scalar(2)), t1_hi, t2_hi, t2_lo); + Packet t3_hi, t3_lo; + fast_twosum(t2_hi, padd(t2_lo, t1_lo), t3_hi, t3_lo); + // x_i * (2 - a * x_i) + twoprod(t3_hi, t3_lo, approx_recip, recip_hi, recip_lo); +} + + +// This function computes log2(x) and returns the result as a double word. +template +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) { + log2_x_hi = plog2(x); + log2_x_lo = pzero(x); + } +}; + +// This specialization uses a more accurate algorithm to compute log2(x) for +// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10. +// This additional accuracy is needed to counter the error-magnification +// inherent in multiplying by a potentially large exponent in pow(x,y). +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. +template <> +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) { + // The function log(1+x)/x is approximated in the interval + // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form + // Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))), + // where the degree 6 polynomial P(x) is evaluated in single precision, + // while the remaining 4 terms of Q(x), as well as the final multiplication by x + // to reconstruct log(1+x) are evaluated in extra precision using + // double word arithmetic. C0 through C3 are extra precise constants + // stored as double words. 
+ // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 10; + // > f = log2(1+x)/x; + // > interval = [sqrt(0.5)-1;sqrt(2)-1]; + // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating); + + const Packet p6 = pset1( 9.703654795885e-2f); + const Packet p5 = pset1(-0.1690667718648f); + const Packet p4 = pset1( 0.1720575392246f); + const Packet p3 = pset1(-0.1789081543684f); + const Packet p2 = pset1( 0.2050433009862f); + const Packet p1 = pset1(-0.2404672354459f); + const Packet p0 = pset1( 0.2885761857032f); + + const Packet C3_hi = pset1(-0.360674142838f); + const Packet C3_lo = pset1(-6.13283912543e-09f); + const Packet C2_hi = pset1(0.480897903442f); + const Packet C2_lo = pset1(-1.44861207474e-08f); + const Packet C1_hi = pset1(-0.721347510815f); + const Packet C1_lo = pset1(-4.84483164698e-09f); + const Packet C0_hi = pset1(1.44269502163f); + const Packet C0_lo = pset1(2.01711713999e-08f); + const Packet one = pset1(1.0f); + + const Packet x = psub(z, one); + // Evaluate P(x) in working precision. + // We evaluate it in multiple parts to improve instruction level + // parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p6, x2, p4); + p_even = pmadd(p_even, x2, p2); + p_even = pmadd(p_even, x2, p0); + Packet p_odd = pmadd(p5, x2, p3); + p_odd = pmadd(p_odd, x2, p1); + Packet p = pmadd(p_odd, x, p_even); + + // Now evaluate the low-order tems of Q(x) in double word precision. + // In the following, due to the alternating signs and the fact that + // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use + // fast_twosum instead of the slower twosum. 
+ Packet q_hi, q_lo; + Packet t_hi, t_lo; + // C3 + x * p(x) + twoprod(p, x, t_hi, t_lo); + fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo); + // C2 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo); + // C1 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo); + // C0 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo); + + // log(z) ~= x * Q(x) + twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo); + } +}; + +// This specialization uses a more accurate algorithm to compute log2(x) for +// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18. +// This additional accuracy is needed to counter the error-magnification +// inherent in multiplying by a potentially large exponent in pow(x,y). +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. + +template <> +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) { + // We use a transformation of variables: + // r = c * (x-1) / (x+1), + // such that + // log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r). + // The function f(r) can be approximated well using an odd polynomial + // of the form + // P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r, + // For the implementation of log2 here, Q is of degree 6 with + // coefficient represented in working precision (double), while C is a + // constant represented in extra precision as a double word to achieve + // full accuracy. 
+ // + // The polynomial coefficients were computed by the Sollya script: + // + // c = 2 / log(2); + // trans = c * (x-1)/(x+1); + // itrans = (1+x/c)/(1-x/c); + // interval=[trans(sqrt(0.5)); trans(sqrt(2))]; + // print(interval); + // f = log2(itrans(x)); + // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating); + const Packet q12 = pset1(2.87074255468000586e-9); + const Packet q10 = pset1(2.38957980901884082e-8); + const Packet q8 = pset1(2.31032094540014656e-7); + const Packet q6 = pset1(2.27279857398537278e-6); + const Packet q4 = pset1(2.31271023278625638e-5); + const Packet q2 = pset1(2.47556738444535513e-4); + const Packet q0 = pset1(2.88543873228900172e-3); + const Packet C_hi = pset1(0.0400377511598501157); + const Packet C_lo = pset1(-4.77726582251425391e-19); + const Packet one = pset1(1.0); + + const Packet cst_2_log2e_hi = pset1(2.88539008177792677); + const Packet cst_2_log2e_lo = pset1(4.07660016854549667e-17); + // c * (x - 1) + Packet num_hi, num_lo; + twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), num_hi, num_lo); + // TODO(rmlarsen): Investigate if using the division algorithm by + // Muller et al. is faster/more accurate. + // 1 / (x + 1) + Packet denom_hi, denom_lo; + doubleword_reciprocal(padd(x, one), denom_hi, denom_lo); + // r = c * (x-1) / (x+1), + Packet r_hi, r_lo; + twoprod(num_hi, num_lo, denom_hi, denom_lo, r_hi, r_lo); + // r2 = r * r + Packet r2_hi, r2_lo; + twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo); + // r4 = r2 * r2 + Packet r4_hi, r4_lo; + twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo); + + // Evaluate Q(r^2) in working precision. We evaluate it in two parts + // (even and odd in r^2) to improve instruction level parallelism. 
+ Packet q_even = pmadd(q12, r4_hi, q8); + Packet q_odd = pmadd(q10, r4_hi, q6); + q_even = pmadd(q_even, r4_hi, q4); + q_odd = pmadd(q_odd, r4_hi, q2); + q_even = pmadd(q_even, r4_hi, q0); + Packet q = pmadd(q_odd, r2_hi, q_even); + + // Now evaluate the low order terms of P(x) in double word precision. + // In the following, due to the increasing magnitude of the coefficients + // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead + // of the slower twosum. + // Q(r^2) * r^2 + Packet p_hi, p_lo; + twoprod(r2_hi, r2_lo, q, p_hi, p_lo); + // Q(r^2) * r^2 + C + Packet p1_hi, p1_lo; + fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo); + // (Q(r^2) * r^2 + C) * r^2 + Packet p2_hi, p2_lo; + twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo); + // ((Q(r^2) * r^2 + C) * r^2 + 1) + Packet p3_hi, p3_lo; + fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo); + + // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r + twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo); + } +}; + +// This function computes exp2(x) (i.e. 2**x). +template +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // TODO(rmlarsen): Add a pexp2 packetop. + return pexp(pmul(pset1(Scalar(EIGEN_LN2)), x)); + } +}; + +// This specialization uses a faster algorithm to compute exp2(x) for floats +// in [-0.5;0.5] with a relative accuracy of 1 ulp. +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. +template <> +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // This function approximates exp2(x) by a degree 6 polynomial of the form + // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in + // single precision, and the remaining steps are evaluated with extra precision using + // double word arithmetic. C is an extra precise constant stored as a double word. 
+ // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 6; + // > f = 2^x; + // > interval = [-0.5;0.5]; + // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating); + + const Packet p4 = pset1(1.539513905e-4f); + const Packet p3 = pset1(1.340007293e-3f); + const Packet p2 = pset1(9.618283249e-3f); + const Packet p1 = pset1(5.550328270e-2f); + const Packet p0 = pset1(0.2402264923f); + + const Packet C_hi = pset1(0.6931471825f); + const Packet C_lo = pset1(2.36836577e-08f); + const Packet one = pset1(1.0f); + + // Evaluate P(x) in working precision. + // We evaluate even and odd parts of the polynomial separately + // to gain some instruction level parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p4, x2, p2); + Packet p_odd = pmadd(p3, x2, p1); + p_even = pmadd(p_even, x2, p0); + Packet p = pmadd(p_odd, x, p_even); + + // Evaluate the remaining terms of Q(x) with extra precision using + // double word arithmetic. + Packet p_hi, p_lo; + // x * p(x) + twoprod(p, x, p_hi, p_lo); + // C + x * p(x) + Packet q1_hi, q1_lo; + twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo); + // x * (C + x * p(x)) + Packet q2_hi, q2_lo; + twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo); + // 1 + x * (C + x * p(x)) + Packet q3_hi, q3_lo; + // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum + // for adding it to unity here. + fast_twosum(one, q2_hi, q3_hi, q3_lo); + return padd(q3_hi, padd(q2_lo, q3_lo)); + } +}; + +// in [-0.5;0.5] with a relative accuracy of 1 ulp. +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. 
+template <> +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // This function approximates exp2(x) by a degree 10 polynomial of the form + // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in + // single precision, and the remaining steps are evaluated with extra precision using + // double word arithmetic. C is an extra precise constant stored as a double word. + // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 11; + // > f = 2^x; + // > interval = [-0.5;0.5]; + // > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating); + + const Packet p9 = pset1(4.431642109085495276e-10); + const Packet p8 = pset1(7.073829923303358410e-9); + const Packet p7 = pset1(1.017822306737031311e-7); + const Packet p6 = pset1(1.321543498017646657e-6); + const Packet p5 = pset1(1.525273342728892877e-5); + const Packet p4 = pset1(1.540353045780084423e-4); + const Packet p3 = pset1(1.333355814685869807e-3); + const Packet p2 = pset1(9.618129107593478832e-3); + const Packet p1 = pset1(5.550410866481961247e-2); + const Packet p0 = pset1(0.240226506959101332); + const Packet C_hi = pset1(0.693147180559945286); + const Packet C_lo = pset1(4.81927865669806721e-17); + const Packet one = pset1(1.0); + + // Evaluate P(x) in working precision. + // We evaluate even and odd parts of the polynomial separately + // to gain some instruction level parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p8, x2, p6); + Packet p_odd = pmadd(p9, x2, p7); + p_even = pmadd(p_even, x2, p4); + p_odd = pmadd(p_odd, x2, p5); + p_even = pmadd(p_even, x2, p2); + p_odd = pmadd(p_odd, x2, p3); + p_even = pmadd(p_even, x2, p0); + p_odd = pmadd(p_odd, x2, p1); + Packet p = pmadd(p_odd, x, p_even); + + // Evaluate the remaining terms of Q(x) with extra precision using + // double word arithmetic. 
+ Packet p_hi, p_lo; + // x * p(x) + twoprod(p, x, p_hi, p_lo); + // C + x * p(x) + Packet q1_hi, q1_lo; + twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo); + // x * (C + x * p(x)) + Packet q2_hi, q2_lo; + twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo); + // 1 + x * (C + x * p(x)) + Packet q3_hi, q3_lo; + // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum + // for adding it to unity here. + fast_twosum(one, q2_hi, q3_hi, q3_lo); + return padd(q3_hi, padd(q2_lo, q3_lo)); + } +}; + +// This function implements the non-trivial case of pow(x,y) where x is +// positive and y is (possibly) non-integer. +// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x. +// TODO(rmlarsen): We should probably add this as a packet op 'ppow', to make it +// easier to specialize or turn off for specific types and/or backends. +template +EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::type Scalar; + // Split x into exponent e_x and mantissa m_x. + Packet e_x; + Packet m_x = pfrexp(x, e_x); + + // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x). + EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440); + const Packet m_x_scale_mask = pcmp_lt(m_x, pset1(sqrt_half)); + m_x = pselect(m_x_scale_mask, pmul(pset1(Scalar(2)), m_x), m_x); + e_x = pselect(m_x_scale_mask, psub(e_x, pset1(Scalar(1))), e_x); + + // Compute log2(m_x) with 6 extra bits of accuracy. + Packet rx_hi, rx_lo; + accurate_log2()(m_x, rx_hi, rx_lo); + + // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled + // precision using double word arithmetic. + Packet f1_hi, f1_lo, f2_hi, f2_lo; + twoprod(e_x, y, f1_hi, f1_lo); + twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo); + // Sum the two terms in f using double word arithmetic. We know + // that |e_x| > |log2(m_x)|, except for the case where e_x==0. + // This means that we can use fast_twosum(f1,f2). 
+ // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any + // accuracy by violating the assumption of fast_twosum, because + // it's a no-op. + Packet f_hi, f_lo; + fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo); + + // Split f into integer and fractional parts. + Packet n_z, r_z; + absolute_split(f_hi, n_z, r_z); + r_z = padd(r_z, f_lo); + Packet n_r; + absolute_split(r_z, n_r, r_z); + n_z = padd(n_z, n_r); + + // We now have an accurate split of f = n_z + r_z and can compute + // x^y = 2**{n_z + r_z} = exp2(r_z) * 2**{n_z}. + // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy + // using a specialized algorithm. Multiplication by the second factor can + // be done exactly using pldexp(), since it is an integer power of 2. + const Packet e_r = fast_accurate_exp2()(r_z); + return pldexp(e_r, n_z); +} + +// Generic implementation of pow(x,y). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet generic_pow(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::type Scalar; + + const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_zero = pset1(Scalar(0)); + const Packet cst_one = pset1(Scalar(1)); + const Packet cst_nan = pset1(NumTraits::quiet_NaN()); + + const Packet abs_x = pabs(x); + // Predicates for sign and magnitude of x. + const Packet x_is_zero = pcmp_eq(x, cst_zero); + const Packet x_is_neg = pcmp_lt(x, cst_zero); + const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); + const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); + const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); + + // Predicates for sign and magnitude of y. 
+ const Packet y_is_one = pcmp_eq(y, cst_one); + const Packet y_is_zero = pcmp_eq(y, cst_zero); + const Packet y_is_neg = pcmp_lt(y, cst_zero); + const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); + const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + EIGEN_CONSTEXPR Scalar huge_exponent = + (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / + NumTraits::epsilon(); + const Packet abs_y_is_huge = pcmp_le(pset1(huge_exponent), pabs(y)); + + // Predicates for whether y is integer and/or even. + const Packet y_is_int = pcmp_eq(pfloor(y), y); + const Packet y_div_2 = pmul(y, pset1(Scalar(0.5))); + const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); + + // Predicates encoding special cases for the value of pow(x,y) + const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), + y_is_int), + abs_y_is_inf); + const Packet pow_is_one = por(por(x_is_one, y_is_zero), + pand(x_is_neg_one, + por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); + const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), + pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), + y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), + y_is_neg)); + const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), + pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), + y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), + y_is_pos)); + + // General computation of pow(x,y) for positive x or negative x and integer y. 
+ const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); + const Packet pow_abs = generic_pow_impl(abs_x, y); + return pselect(y_is_one, x, + pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, cst_pos_inf, + pselect(pow_is_zero, cst_zero, + pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); +} + + + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template +struct ppolevl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits::type coeff[]) { + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + return pmadd(ppolevl::run(x, coeff), x, pset1(coeff[N])); + } +}; + +template +struct ppolevl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits::type coeff[]) { + EIGEN_UNUSED_VARIABLE(x); + return pset1(coeff[0]); + } +}; + +/* chbevl (modified for Eigen) + * + * Evaluate Chebyshev series + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N], chebevl(); + * + * y = chbevl( x, coef, N ); + * + * + * + * DESCRIPTION: + * + * Evaluates the series + * + * N-1 + * - ' + * y = > coef[i] T (x/2) + * - i + * i=0 + * + * of Chebyshev polynomials Ti at argument x/2. + * + * Coefficients are stored in reverse order, i.e. 
the zero + * order term is last in the array. Note N is the number of + * coefficients, not the order. + * + * If coefficients are for the interval a to b, x must + * have been transformed to x -> 2(2x - b - a)/(b-a) before + * entering the routine. This maps x from (a, b) to (-1, 1), + * over which the Chebyshev polynomials are defined. + * + * If the coefficients are for the inverted interval, in + * which (a, b) is mapped to (1/b, 1/a), the transformation + * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, + * this becomes x -> 4a/x - 1. + * + * + * + * SPEED: + * + * Taking advantage of the recurrence properties of the + * Chebyshev polynomials, the routine requires one more + * addition per loop than evaluating a nested polynomial of + * the same degree. + * + */ + +template +struct pchebevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits::type coef[]) { + typedef typename unpacket_traits::type Scalar; + Packet b0 = pset1(coef[0]); + Packet b1 = pset1(static_cast(0.f)); + Packet b2; + + for (int i = 1; i < N; i++) { + b2 = b1; + b1 = b0; + b0 = psub(pmadd(x, b1, pset1(coef[i])), b2); + } + + return pmul(pset1(static_cast(0.5f)), psub(b0, b2)); + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H diff --git a/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h new file mode 100644 index 00000000..177a04e9 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H + +namespace Eigen { +namespace internal { + +// Forward declarations of the generic math functions +// implemented in GenericPacketMathFunctions.h +// This is needed to workaround a circular dependency. + +/*************************************************************************** + * Some generic implementations to be used by implementors +***************************************************************************/ + +/** Default implementation of pfrexp. + * It is expected to be called by implementers of template<> pfrexp. + */ +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic(const Packet& a, Packet& exponent); + +// Extracts the biased exponent value from Packet p, and casts the results to +// a floating-point Packet type. Used by pfrexp_generic. Override this if +// there is no unpacket_traits::integer_packet. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic_get_biased_exponent(const Packet& p); + +/** Default implementation of pldexp. + * It is expected to be called by implementers of template<> pldexp. 
+ */ +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pldexp_generic(const Packet& a, const Packet& exponent); + +/** \internal \returns log(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_float(const Packet _x); + +/** \internal \returns log2(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_float(const Packet _x); + +/** \internal \returns log(x) for double precision real numbers */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_double(const Packet _x); + +/** \internal \returns log2(x) for double precision real numbers */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_double(const Packet _x); + +/** \internal \returns log(1 + x) */ +template +Packet generic_plog1p(const Packet& x); + +/** \internal \returns exp(x)-1 */ +template +Packet generic_expm1(const Packet& x); + +/** \internal \returns exp(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_float(const Packet _x); + +/** \internal \returns exp(x) for double precision real numbers */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_double(const Packet _x); + +/** \internal \returns sin(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psin_float(const Packet& x); + +/** \internal \returns cos(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pcos_float(const Packet& x); + +/** \internal \returns sqrt(x) for complex types */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psqrt_complex(const Packet& a); + +template struct ppolevl; + + +} // end namespace internal +} // end namespace Eigen + 
+#endif // 
EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H diff --git a/inst/include/Eigen/src/Core/arch/Default/Half.h b/inst/include/Eigen/src/Core/arch/Default/Half.h new file mode 100644 index 00000000..9f8e8cc1 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/Default/Half.h @@ -0,0 +1,942 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. 
It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. + + +#ifndef EIGEN_HALF_H +#define EIGEN_HALF_H + +#include + +#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +// When compiling with GPU support, the "__half_raw" base class as well as +// some other routines are defined in the GPU compiler header files +// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr +// As a consequence, we get compile failures when compiling Eigen with +// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building +// Eigen with GPU support + #pragma push_macro("EIGEN_CONSTEXPR") + #undef EIGEN_CONSTEXPR + #define EIGEN_CONSTEXPR +#endif + +#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \ + template <> \ + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED \ + PACKET_F16 METHOD(const PACKET_F16& _x) { \ + return float2half(METHOD(half2float(_x))); \ + } + +namespace Eigen { + +struct half; + +namespace half_impl { + +// We want to use the __half_raw struct from the HIP header file only during the device compile phase. +// This is required because of a quirk in the way TensorFlow GPU builds are done. +// When compiling TensorFlow source code with GPU support, files that +// * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc +// * do not contain GPU kernels ( i.e. 
*.cc files) are compiled via gcc (typically) +// +// Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that +// * are defined in a file that gets compiled via hipcc AND +// * have Eigen::half as a pass-by-value argument AND +// * are called in a file that gets compiled via gcc +// +// In the scenario described above the caller and callee will see different versions +// of the Eigen::half base class __half_raw, and they will be compiled by different compilers +// +// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in +// the callee getting corrupted values for the Eigen::half argument. +// +// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves +// this error, and hence the following convoluted #if condition +#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { +#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)) + // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF) + // The element type for shared memory cannot have non-trivial constructors + // and hence the following special casing (which skips the zero-initialization). 
+ // Note that this check gets done even in the host compilation phase, and + // hence the need for this + EIGEN_DEVICE_FUNC __half_raw() {} +#else + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {} +#endif +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) { + } + __fp16 x; +#else + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {} + numext::uint16_t x; +#endif +}; + +#elif defined(EIGEN_HAS_HIP_FP16) + // Nothing to do here + // HIP fp16 header file has a definition for __half_raw +#elif defined(EIGEN_HAS_CUDA_FP16) + #if EIGEN_CUDA_SDK_VER < 90000 + // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw + typedef __half __half_raw; + #endif // defined(EIGEN_HAS_CUDA_FP16) +#elif defined(SYCL_DEVICE_ONLY) + typedef cl::sycl::half __half_raw; +#endif + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); + +struct half_base : public __half_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {} + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); } + #elif defined(EIGEN_HAS_CUDA_FP16) + #if EIGEN_CUDA_SDK_VER >= 90000 + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + #endif + #endif +#endif +}; + +} // namespace half_impl + +// Class definition. 
+struct half : public half_impl::half_base { + + // Writing this out as separate #if-else blocks to make the code easier to follow + // The same applies to most #if-else blocks in this file +#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) + // Use the same base class for the following two scenarios + // * when compiling without GPU support enabled + // * during host compile phase when compiling with GPU support enabled + typedef half_impl::__half_raw __half_raw; +#elif defined(EIGEN_HAS_HIP_FP16) + // Nothing to do here + // HIP fp16 header file has a definition for __half_raw +#elif defined(EIGEN_HAS_CUDA_FP16) + // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so + // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within + // #if defined(EIGEN_HAS_CUDA_FP16) is needed + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + typedef half_impl::__half_raw __half_raw; + #endif +#endif + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {} + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {} + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} + #elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} + #endif + #endif +#endif + + + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b) + : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + template + explicit EIGEN_DEVICE_FUNC half(T val) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(val))) {} + explicit EIGEN_DEVICE_FUNC half(float f) + : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} + + // Following the convention of numpy, converting between complex and + // float will lead to loss of imag value. 
+ template + explicit EIGEN_DEVICE_FUNC half(std::complex c) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(c.real()))) {} + + EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless. + return half_impl::half_to_float(*this); + } + +#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE) + EIGEN_DEVICE_FUNC operator __half() const { + ::__half_raw hr; + hr.x = x; + return __half(hr); + } +#endif +}; + +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return 
Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); } + static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } +}; + +// If std::numeric_limits<T> is specialized, should also specialize +// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and +// std::numeric_limits<const volatile T> +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // end namespace std + +namespace Eigen { + +namespace half_impl { + +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) +// Note: We deliberately do *not* define this to 1 even if we have Arm's native +// fp16 type since GPU halfs are rather different from native CPU halfs. +// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16 +#define EIGEN_HAS_NATIVE_FP16 +#endif + +// Intrinsics for native fp16 support. Note that on current hardware, +// these are no faster than fp32 arithmetic (you need to use the half2 +// versions to get the ALU speed increased), but you do save the +// conversion steps back and forth. 
+ +#if defined(EIGEN_HAS_NATIVE_FP16) +EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { +#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + return __hadd(::__half(a), ::__half(b)); +#else + return __hadd(a, b); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { +#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + return __hdiv(a, b); +#else + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { + return __hneg(a); +} +EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { + a = a + b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { + a = a * b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { + a = a - b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { + a = a / b; + return a; +} +EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { + return __heq(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { + return __hne(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { + return __hle(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { + return __hgt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { + return __hge(a, b); +} +#endif + +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +EIGEN_STRONG_INLINE 
EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(vaddh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(vmulh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(vsubh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(vdivh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + return half(vnegh_f16(a.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(vaddh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(vmulh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(vsubh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(vdivh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return vceqh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return !vceqh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return vclth_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return vcleh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return vcgth_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return vcgeh_f16(a.x, b.x); +} +// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, +// invoked by NVCC’ (e.g. on MacOS). 
The former needs to see both host and device implementation +// of the functions, while the latter can only deal with one of them. +#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +// We need to provide emulated *host-side* FP16 operators for clang. +#pragma push_macro("EIGEN_DEVICE_FUNC") +#undef EIGEN_DEVICE_FUNC +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16) +#define EIGEN_DEVICE_FUNC __host__ +#else // both host and device need emulated ops. +#define EIGEN_DEVICE_FUNC __host__ __device__ +#endif +#endif + +// Definitions for CPUs and older HIP+CUDA, mostly working through conversion +// to/from fp32. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const 
half& b) { + return numext::equal_strict(float(a),float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return numext::not_equal_strict(float(a), float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return float(a) >= float(b); +} + +#if defined(__clang__) && defined(__CUDA__) +#pragma pop_macro("EIGEN_DEVICE_FUNC") +#endif +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { + return half(static_cast(a) / static_cast(b)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) { + a += half(1); + return a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) { + a -= half(1); + return a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a, int) { + half original_value = a; + ++a; + return original_value; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) { + half original_value = a; + --a; + return original_value; +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. 
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) { + // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type + // in the hip_fp16 header file, and that will trigger a compile error + // On the other hand, having anything but a return statement also triggers a compile error + // because this is constexpr function. + // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out + // of this catch22 by having separate bodies for GPU / non GPU +#if defined(EIGEN_HAS_GPU_FP16) + __half_raw h; + h.x = x; + return h; +#else + return __half_raw(x); +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) { + // HIP/CUDA/Default have a member 'x' of type uint16_t. + // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast. + // For SYCL, cl::sycl::half is _Float16, so cast directly. +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return numext::bit_cast(h.x); +#elif defined(SYCL_DEVICE_ONLY) + return numext::bit_cast(h); +#else + return h.x; +#endif +} + +union float32_bits { + unsigned int u; + float f; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; + +#elif defined(EIGEN_HAS_FP16_C) + __half_raw h; + h.x = _cvtss_sh(ff, 0); + return h; + +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + __half_raw h; + h.x = static_cast<__fp16>(ff); + return h; + +#else + float32_bits f; f.f = ff; + + const float32_bits f32infty = { 255 << 23 }; + const float32_bits f16max = { (127 + 16) << 23 }; + const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + 
__half_raw o; + o.x = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! 
+ o.x = static_cast(f.u >> 13); + } + } + + o.x |= static_cast(sign >> 16); + return o; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __half2float(h); +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return static_cast(h.x); +#else + const float32_bits magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + float32_bits o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { +#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + return (numext::bit_cast(a.x) & 0x7fff) == 0x7c00; +#else + return (a.x & 0x7fff) == 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hisnan(a); +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return (numext::bit_cast(a.x) & 0x7fff) > 0x7c00; +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half 
abs(const half& a) { +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return half(vabsh_f16(a.x)); +#else + half result; + result.x = a.x & 0x7FFF; + return result; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { + return half(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { +#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return half(::hlog(a)); +#else + return half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { + return half(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { + return half(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) { + return half(static_cast(EIGEN_LOG2E) * ::logf(float(a))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { + return half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { + return half(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { + return half(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { + return half(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const 
half& a) { + return half(::tanhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) { + return half(::asinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { + return half(::acosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hfloor(a)); +#else + return half(::floorf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hceil(a)); +#else + return half(::ceilf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) { + return half(::rintf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) { + return half(::roundf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) { + return half(::fmodf(float(a), float(b))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f2 < f1 ? b : a; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f1 < f2 ? 
b : a; +#endif +} + +#ifndef EIGEN_NO_IO +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { + os << static_cast(v); + return os; +} +#endif + +} // end namespace half_impl + +// import Eigen::half_impl::half into Eigen namespace +// using half_impl::half; + +namespace internal { + +template<> +struct random_default_impl +{ + static inline half run(const half& x, const half& y) + { + return x + (y-x) * half(float(std::rand()) / float(RAND_MAX)); + } + static inline half run() + { + return run(half(-1.f), half(1.f)); + } +}; + +template<> struct is_arithmetic { enum { value = true }; }; + +} // end namespace internal + +template<> struct NumTraits + : GenericNumTraits +{ + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() { + return half_impl::raw_uint16_to_half(0x0800); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { + return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() { + return half_impl::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() { + return half_impl::raw_uint16_to_half(0xfbff); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() { + return half_impl::raw_uint16_to_half(0x7c00); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + return half_impl::raw_uint16_to_half(0x7e00); + } +}; + +} // end namespace Eigen + +#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + #pragma pop_macro("EIGEN_CONSTEXPR") +#endif + +namespace Eigen { +namespace numext { + +#if defined(EIGEN_GPU_COMPILE_PHASE) + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
bool(isnan)(const Eigen::half& h) { + return (half_impl::isnan)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) { + return (half_impl::isinf)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) { + return (half_impl::isfinite)(h); +} + +#endif + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast(const uint16_t& src) { + return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src)); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast(const Eigen::half& src) { + return Eigen::half_impl::raw_half_as_uint16(src); +} + +} // namespace numext +} // namespace Eigen + +// Add the missing shfl* intrinsics. +// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300. +// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)) +// +// HIP and CUDA prior to SDK 9.0 define +// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float +// CUDA since 9.0 deprecates those and instead defines +// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync, +// with native support for __half and __nv_bfloat16 +// +// Note that the following are __device__ - only functions. 
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \ + || defined(EIGEN_HIPCC) + +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000 + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_sync(mask, h, srcLane, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_up_sync(mask, h, delta, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_down_sync(mask, h, delta, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_xor_sync(mask, h, laneMask, width)); +} + +#else // HIP or CUDA SDK < 9.0 + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl(ivar, srcLane, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_up(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_down(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + const int ivar = 
static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_xor(ivar, laneMask, width))); +} + +#endif // HIP vs CUDA +#endif // __shfl* + +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. +#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \ + || defined(EIGEN_HIPCC) +EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast(ptr))); +} +#endif // __ldg + +#if EIGEN_HAS_STD_HASH +namespace std { +template <> +struct hash { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { + return static_cast(Eigen::numext::bit_cast(a)); + } +}; +} // end namespace std +#endif + +#endif // EIGEN_HALF_H diff --git a/inst/include/Eigen/src/Core/arch/Default/Settings.h b/inst/include/Eigen/src/Core/arch/Default/Settings.h index 097373c8..a5c3ada4 100644 --- a/inst/include/Eigen/src/Core/arch/Default/Settings.h +++ b/inst/include/Eigen/src/Core/arch/Default/Settings.h @@ -21,7 +21,7 @@ * it does not correspond to the number of iterations or the number of instructions */ #ifndef EIGEN_UNROLLING_LIMIT -#define EIGEN_UNROLLING_LIMIT 100 +#define EIGEN_UNROLLING_LIMIT 110 #endif /** Defines the threshold between a "small" and a "large" matrix. diff --git a/inst/include/Eigen/src/Core/arch/Default/TypeCasting.h b/inst/include/Eigen/src/Core/arch/Default/TypeCasting.h new file mode 100644 index 00000000..fb8183b7 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/Default/TypeCasting.h @@ -0,0 +1,120 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GENERIC_TYPE_CASTING_H +#define EIGEN_GENERIC_TYPE_CASTING_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(static_cast(a)); + #else + return Eigen::half(static_cast(a)); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __half2float(a); + #else + return static_cast(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef 
Eigen::bfloat16 result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const float& a) const { + return Eigen::bfloat16(a); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::bfloat16 result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const int& a) const { + return Eigen::bfloat16(static_cast(a)); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::bfloat16& a) const { + return static_cast(a); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +} +} + +#endif // EIGEN_GENERIC_TYPE_CASTING_H diff --git a/inst/include/Eigen/src/Core/arch/CUDA/MathFunctions.h b/inst/include/Eigen/src/Core/arch/GPU/MathFunctions.h similarity index 82% rename from inst/include/Eigen/src/Core/arch/CUDA/MathFunctions.h rename to inst/include/Eigen/src/Core/arch/GPU/MathFunctions.h index 0348b41d..d2b3a256 100644 --- a/inst/include/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/inst/include/Eigen/src/Core/arch/GPU/MathFunctions.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H -#define EIGEN_MATH_FUNCTIONS_CUDA_H +#ifndef EIGEN_MATH_FUNCTIONS_GPU_H +#define EIGEN_MATH_FUNCTIONS_GPU_H namespace Eigen { @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) 
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog(const float4& a) { @@ -56,6 +56,18 @@ double2 pexp(const double2& a) return make_double2(exp(a.x), exp(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pexpm1(const float4& a) +{ + return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pexpm1(const double2& a) +{ + return make_double2(expm1(a.x), expm1(a.y)); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psqrt(const float4& a) { @@ -88,4 +100,4 @@ double2 prsqrt(const double2& a) } // end namespace Eigen -#endif // EIGEN_MATH_FUNCTIONS_CUDA_H +#endif // EIGEN_MATH_FUNCTIONS_GPU_H diff --git a/inst/include/Eigen/src/Core/arch/GPU/PacketMath.h b/inst/include/Eigen/src/Core/arch/GPU/PacketMath.h new file mode 100644 index 00000000..689110de --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/GPU/PacketMath.h @@ -0,0 +1,1685 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_GPU_H +#define EIGEN_PACKET_MATH_GPU_H + +namespace Eigen { + +namespace internal { + +// Read-only data cached load available. +#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) +#define EIGEN_GPU_HAS_LDG 1 +#endif + +// FP16 math available. 
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) +#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1 +#endif + +#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) +#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1 +#endif + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) + +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasNdtri = 1, + HasBessel = 1, + HasIGamma = 1, + HasIGammaDerA = 1, + HasGammaSampleDerAlpha = 1, + HasIGammac = 1, + HasBetaInc = 1, + + HasBlend = 0, + HasFloor = 1, + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasNdtri = 1, + HasBessel = 1, + HasIGamma = 1, + HasIGammaDerA = 1, + HasGammaSampleDerAlpha = 1, + HasIGammac = 1, + HasBetaInc = 1, + + HasBlend = 0, + HasFloor = 1, + }; +}; + + +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; 
enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { + return make_double2(from, from); +} + +// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, +// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation +// of the functions, while the latter can only deal with one of them. +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) +namespace { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) | __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) | + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) ^ __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) ^ + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & ~__float_as_int(b)); +} 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + ~__double_as_longlong(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, + const float& b) { + return __int_as_float(a == b ? 0xffffffffu : 0u); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, + const double& b) { + return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, + const float& b) { + return __int_as_float(a < b ? 0xffffffffu : 0u); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, + const double& b) { + return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull); +} + +} // namespace + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, + const float4& b) { + return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), + bitwise_and(a.z, b.z), bitwise_and(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand(const double2& a, + const double2& b) { + return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por(const float4& a, + const float4& b) { + return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), + bitwise_or(a.z, b.z), bitwise_or(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por(const double2& a, + const double2& b) { + return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor(const float4& a, + const float4& b) { + return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), + bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor(const double2& a, + const double2& b) { + return make_double2(bitwise_xor(a.x, b.x), 
bitwise_xor(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot(const float4& a, + const float4& b) { + return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), + bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pandnot(const double2& a, const double2& b) { + return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq(const float4& a, + const float4& b) { + return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), + eq_mask(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt(const float4& a, + const float4& b) { + return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), + lt_mask(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pcmp_eq(const double2& a, const double2& b) { + return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pcmp_lt(const double2& a, const double2& b) { + return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y)); +} +#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { + return 
make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pload(const float* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return __ldg((const float4*)from); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return __ldg((const double2*)from); +#else + return make_double2(from[0], from[1]); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return make_float4(__ldg(from+0), __ldg(from+1), 
__ldg(from+2), __ldg(from+3)); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return make_double2(__ldg(from+0), __ldg(from+1)); +#else + return make_double2(from[0], from[1]); +#endif +} + +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { + return fmin(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_mul(const float4& a) { + return a.x * a.y * a.z * a.w; +} +template<> 
EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) { + return a.x * a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { + return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { + return make_double2(fabs(a.x), fabs(a.y)); +} + +template<> EIGEN_DEVICE_FUNC inline float4 pfloor(const float4& a) { + return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pfloor(const double2& a) { + return make_double2(floor(a.x), floor(a.y)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + float tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + + tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) + +// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning +// its corresponding packet_traits must be visible on host. 
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + +typedef ulonglong2 Packet4h2; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4h2 type; + typedef Packet4h2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 1, + HasLog = 1, + HasLog1p = 1 + }; +}; + +namespace { +// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __halves2half2(a, b); +#else + // Round-about way since __halves2half2 is a __device__ function. 
+ return __floats2half2_rn(__half2float(a), __half2float(b)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __low2half(a); +#else + return __float2half(__low2float(a)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __high2half(a); +#else + return __float2half(__high2float(a)); +#endif +} +} // namespace + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __half2half2(from); +#else + const float f = __half2float(from); + return __floats2half2_rn(f, f); +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pset1(const Eigen::half& from) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = pset1(from); + p_alias[1] = pset1(from); + p_alias[2] = pset1(from); + p_alias[3] = pset1(from); + return r; +} + +// We now need this visible on both host and device. 
+// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) +namespace { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { + return *reinterpret_cast(from); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { + return combine_half(from[0], from[1]); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { + return combine_half(from[0], from[0]); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, + const half2& from) { + *reinterpret_cast(to) = from; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, + const half2& from) { + to[0] = get_half2_low(from); + to[1] = get_half2_high(from); +} + + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( + const Eigen::half* from) { +#if defined(EIGEN_GPU_HAS_LDG) + // Input is guaranteed to be properly aligned. + return __ldg(reinterpret_cast(from)); +#else + return combine_half(*(from+0), *(from+1)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( + const Eigen::half* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return combine_half(*(from+0), *(from+1)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, + Index stride) { + return combine_half(from[0*stride], from[1*stride]); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( + Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = get_half2_low(from); + to[stride*1] = get_half2_high(from); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { + return get_half2_low(a); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); + half result2 = 
half_impl::raw_uint16_to_half(a2.x & 0x7FFF); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + return pset1(true_half); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { + half false_half = half_impl::raw_uint16_to_half(0x0000u); + return pset1(false_half); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __half a1 = get_half2_low(kernel.packet[0]); + __half a2 = get_half2_high(kernel.packet[0]); + __half b1 = get_half2_low(kernel.packet[1]); + __half b2 = get_half2_high(kernel.packet[1]); + kernel.packet[0] = combine_half(a1, b1); + kernel.packet[1] = combine_half(a2, b2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +#else + float f = __half2float(a) + 1.0f; + return combine_half(a, __float2half(f)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, + const half2& a, + const half2& b) { + half mask_low = get_half2_low(mask); + half mask_high = get_half2_high(mask); + half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); + half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); + return combine_half(result_low, result_high); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) == __half2float(b2) ? 
true_half : false_half; + return combine_half(eq1, eq2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; + return combine_half(eq1, eq2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, + const half2& b) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, + const half2& b) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, + const half2& b) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, + const half2& b) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); + half result2 = 
half_impl::raw_uint16_to_half(a2.x & ~b2.x); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hsub2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 - b1; + float r2 = a2 - b2; + return __floats2half2_rn(r1, r2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hneg2(a); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return __floats2half2_rn(-a1, -a2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, + const half2& b, + const half2& c) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hfma2(a, b, c); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float c1 = __low2float(c); + float c2 = __high2float(c); + float r1 = a1 * b1 + c1; + float r2 = a2 * b2 + c2; + return __floats2half2_rn(r1, r2); 
+#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __h2div(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, + const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, + const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hadd(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half(a1 + a2)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 > a2 ? 
get_half2_low(a) : get_half2_high(a); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 < a2 ? get_half2_low(a) : get_half2_high(a); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hmul(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half(a1 * a2)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expm1f(a1); + float r2 = expm1f(a2); + return __floats2half2_rn(r1, r2); +} + +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 plog(const half2& a) { + return h2log(a); +} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 pexp(const half2& a) { + return h2exp(a); +} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 psqrt(const half2& a) { + return h2sqrt(a); +} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 prsqrt(const half2& a) { + return h2rsqrt(a); +} + +#else + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = logf(a1); + float r2 = logf(a2); + return __floats2half2_rn(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { + float a1 = __low2float(a); + float a2 = 
__high2float(a); + float r1 = expf(a1); + float r2 = expf(a2); + return __floats2half2_rn(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = sqrtf(a1); + float r2 = sqrtf(a2); + return __floats2half2_rn(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = rsqrtf(a1); + float r2 = rsqrtf(a2); + return __floats2half2_rn(r1, r2); +} +#endif +} // namespace + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pload(const Eigen::half* from) { + return *reinterpret_cast(from); +} + +// unaligned load; +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +ploadu(const Eigen::half* from) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = ploadu(from + 0); + p_alias[1] = ploadu(from + 2); + p_alias[2] = ploadu(from + 4); + p_alias[3] = ploadu(from + 6); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +ploaddup(const Eigen::half* from) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = ploaddup(from + 0); + p_alias[1] = ploaddup(from + 1); + p_alias[2] = ploaddup(from + 2); + p_alias[3] = ploaddup(from + 3); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore( + Eigen::half* to, const Packet4h2& from) { + *reinterpret_cast(to) = from; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu( + Eigen::half* to, const Packet4h2& from) { + const half2* from_alias = reinterpret_cast(&from); + pstoreu(to + 0,from_alias[0]); + pstoreu(to + 2,from_alias[1]); + pstoreu(to + 4,from_alias[2]); + pstoreu(to + 6,from_alias[3]); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 +ploadt_ro(const Eigen::half* from) { +#if defined(EIGEN_GPU_HAS_LDG) + Packet4h2 r; + r = __ldg(reinterpret_cast(from)); + return r; +#else + Packet4h2 r; + 
half2* r_alias = reinterpret_cast(&r); + r_alias[0] = ploadt_ro_aligned(from + 0); + r_alias[1] = ploadt_ro_aligned(from + 2); + r_alias[2] = ploadt_ro_aligned(from + 4); + r_alias[3] = ploadt_ro_aligned(from + 6); + return r; +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 +ploadt_ro(const Eigen::half* from) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + r_alias[0] = ploadt_ro_unaligned(from + 0); + r_alias[1] = ploadt_ro_unaligned(from + 2); + r_alias[2] = ploadt_ro_unaligned(from + 4); + r_alias[3] = ploadt_ro_unaligned(from + 6); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pgather(const Eigen::half* from, Index stride) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); + p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); + p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); + p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( + Eigen::half* to, const Packet4h2& from, Index stride) { + const half2* from_alias = reinterpret_cast(&from); + pscatter(to + stride * 0, from_alias[0], stride); + pscatter(to + stride * 2, from_alias[1], stride); + pscatter(to + stride * 4, from_alias[2], stride); + pscatter(to + stride * 6, from_alias[3], stride); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst( + const Packet4h2& a) { + return pfirst(*(reinterpret_cast(&a))); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs( + const Packet4h2& a) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + p_alias[0] = pabs(a_alias[0]); + p_alias[1] = pabs(a_alias[1]); + p_alias[2] = pabs(a_alias[2]); + p_alias[3] = pabs(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue( + const 
Packet4h2& /*a*/) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + return pset1(true_half); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero(const Packet4h2& /*a*/) { + half false_half = half_impl::raw_uint16_to_half(0x0000u); + return pset1(false_half); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double( + double* d_row0, double* d_row1, double* d_row2, double* d_row3, + double* d_row4, double* d_row5, double* d_row6, double* d_row7) { + double d_tmp; + d_tmp = d_row0[1]; + d_row0[1] = d_row4[0]; + d_row4[0] = d_tmp; + + d_tmp = d_row1[1]; + d_row1[1] = d_row5[0]; + d_row5[0] = d_tmp; + + d_tmp = d_row2[1]; + d_row2[1] = d_row6[0]; + d_row6[0] = d_tmp; + + d_tmp = d_row3[1]; + d_row3[1] = d_row7[0]; + d_row7[0] = d_tmp; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( + half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) { + half2 f_tmp; + f_tmp = f_row0[1]; + f_row0[1] = f_row2[0]; + f_row2[0] = f_tmp; + + f_tmp = f_row1[1]; + f_row1[1] = f_row3[0]; + f_row3[0] = f_tmp; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose_half(half2& f0, half2& f1) { + __half a1 = get_half2_low(f0); + __half a2 = get_half2_high(f0); + __half b1 = get_half2_low(f1); + __half b2 = get_half2_high(f1); + f0 = combine_half(a1, b1); + f1 = combine_half(a2, b2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + double* d_row0 = reinterpret_cast(&kernel.packet[0]); + double* d_row1 = reinterpret_cast(&kernel.packet[1]); + double* d_row2 = reinterpret_cast(&kernel.packet[2]); + double* d_row3 = reinterpret_cast(&kernel.packet[3]); + double* d_row4 = reinterpret_cast(&kernel.packet[4]); + double* d_row5 = reinterpret_cast(&kernel.packet[5]); + double* d_row6 = reinterpret_cast(&kernel.packet[6]); + double* d_row7 = reinterpret_cast(&kernel.packet[7]); + ptranspose_double(d_row0, d_row1, d_row2, d_row3, + d_row4, d_row5, d_row6, d_row7); + + + half2* f_row0 = 
reinterpret_cast(d_row0); + half2* f_row1 = reinterpret_cast(d_row1); + half2* f_row2 = reinterpret_cast(d_row2); + half2* f_row3 = reinterpret_cast(d_row3); + ptranspose_half2(f_row0, f_row1, f_row2, f_row3); + ptranspose_half(f_row0[0], f_row1[0]); + ptranspose_half(f_row0[1], f_row1[1]); + ptranspose_half(f_row2[0], f_row3[0]); + ptranspose_half(f_row2[1], f_row3[1]); + + f_row0 = reinterpret_cast(d_row0 + 1); + f_row1 = reinterpret_cast(d_row1 + 1); + f_row2 = reinterpret_cast(d_row2 + 1); + f_row3 = reinterpret_cast(d_row3 + 1); + ptranspose_half2(f_row0, f_row1, f_row2, f_row3); + ptranspose_half(f_row0[0], f_row1[0]); + ptranspose_half(f_row0[1], f_row1[1]); + ptranspose_half(f_row2[0], f_row3[0]); + ptranspose_half(f_row2[1], f_row3[1]); + + f_row0 = reinterpret_cast(d_row4); + f_row1 = reinterpret_cast(d_row5); + f_row2 = reinterpret_cast(d_row6); + f_row3 = reinterpret_cast(d_row7); + ptranspose_half2(f_row0, f_row1, f_row2, f_row3); + ptranspose_half(f_row0[0], f_row1[0]); + ptranspose_half(f_row0[1], f_row1[1]); + ptranspose_half(f_row2[0], f_row3[0]); + ptranspose_half(f_row2[1], f_row3[1]); + + f_row0 = reinterpret_cast(d_row4 + 1); + f_row1 = reinterpret_cast(d_row5 + 1); + f_row2 = reinterpret_cast(d_row6 + 1); + f_row3 = reinterpret_cast(d_row7 + 1); + ptranspose_half2(f_row0, f_row1, f_row2, f_row3); + ptranspose_half(f_row0[0], f_row1[0]); + ptranspose_half(f_row0[1], f_row1[1]); + ptranspose_half(f_row2[0], f_row3[0]); + ptranspose_half(f_row2[1], f_row3[1]); + +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +plset(const Eigen::half& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f))); + p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), + __hadd(a, __float2half(3.0f))); + p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), + __hadd(a, __float2half(5.0f))); + p_alias[3] = __halves2half2(__hadd(a, 
__float2half(6.0f)), + __hadd(a, __float2half(7.0f))); + return r; +#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + + half2 b = pset1(a); + half2 c; + half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f)); + half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f)); + + c = __hadd2(b, half_offset0); + r_alias[0] = plset(__low2half(c)); + r_alias[1] = plset(__high2half(c)); + + c = __hadd2(b, half_offset1); + r_alias[2] = plset(__low2half(c)); + r_alias[3] = plset(__high2half(c)); + + return r; + +#else + float f = __half2float(a); + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = combine_half(a, __float2half(f + 1.0f)); + p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); + p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); + return r; +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pselect(const Packet4h2& mask, const Packet4h2& a, + const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* mask_alias = reinterpret_cast(&mask); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]); + r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]); + r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]); + r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pcmp_eq(const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]); + r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]); + r_alias[2] = 
pcmp_eq(a_alias[2], b_alias[2]); + r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pand(a_alias[0], b_alias[0]); + r_alias[1] = pand(a_alias[1], b_alias[1]); + r_alias[2] = pand(a_alias[2], b_alias[2]); + r_alias[3] = pand(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = por(a_alias[0], b_alias[0]); + r_alias[1] = por(a_alias[1], b_alias[1]); + r_alias[2] = por(a_alias[2], b_alias[2]); + r_alias[3] = por(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pxor(a_alias[0], b_alias[0]); + r_alias[1] = pxor(a_alias[1], b_alias[1]); + r_alias[2] = pxor(a_alias[2], b_alias[2]); + r_alias[3] = pxor(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pandnot(const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pandnot(a_alias[0], b_alias[0]); + r_alias[1] = pandnot(a_alias[1], b_alias[1]); + r_alias[2] = pandnot(a_alias[2], b_alias[2]); + r_alias[3] = pandnot(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
Packet4h2 padd( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = padd(a_alias[0], b_alias[0]); + r_alias[1] = padd(a_alias[1], b_alias[1]); + r_alias[2] = padd(a_alias[2], b_alias[2]); + r_alias[3] = padd(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = psub(a_alias[0], b_alias[0]); + r_alias[1] = psub(a_alias[1], b_alias[1]); + r_alias[2] = psub(a_alias[2], b_alias[2]); + r_alias[3] = psub(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = pnegate(a_alias[0]); + r_alias[1] = pnegate(a_alias[1]); + r_alias[2] = pnegate(a_alias[2]); + r_alias[3] = pnegate(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) { + return a; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pmul(a_alias[0], b_alias[0]); + r_alias[1] = pmul(a_alias[1], b_alias[1]); + r_alias[2] = pmul(a_alias[2], b_alias[2]); + r_alias[3] = pmul(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd( + const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias 
= reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + const half2* c_alias = reinterpret_cast(&c); + r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]); + r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]); + r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]); + r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pdiv(a_alias[0], b_alias[0]); + r_alias[1] = pdiv(a_alias[1], b_alias[1]); + r_alias[2] = pdiv(a_alias[2], b_alias[2]); + r_alias[3] = pdiv(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pmin(a_alias[0], b_alias[0]); + r_alias[1] = pmin(a_alias[1], b_alias[1]); + r_alias[2] = pmin(a_alias[2], b_alias[2]); + r_alias[3] = pmin(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pmax(a_alias[0], b_alias[0]); + r_alias[1] = pmax(a_alias[1], b_alias[1]); + r_alias[2] = pmax(a_alias[2], b_alias[2]); + r_alias[3] = pmax(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux( + const Packet4h2& a) { + const half2* a_alias = reinterpret_cast(&a); + + return predux(a_alias[0]) + predux(a_alias[1]) + + predux(a_alias[2]) + predux(a_alias[3]); +} + +template 
<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( + const Packet4h2& a) { + const half2* a_alias = reinterpret_cast(&a); + half2 m0 = combine_half(predux_max(a_alias[0]), + predux_max(a_alias[1])); + half2 m1 = combine_half(predux_max(a_alias[2]), + predux_max(a_alias[3])); + __half first = predux_max(m0); + __half second = predux_max(m1); +#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) + return (__hgt(first, second) ? first : second); +#else + float ffirst = __half2float(first); + float fsecond = __half2float(second); + return (ffirst > fsecond)? first: second; +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( + const Packet4h2& a) { + const half2* a_alias = reinterpret_cast(&a); + half2 m0 = combine_half(predux_min(a_alias[0]), + predux_min(a_alias[1])); + half2 m1 = combine_half(predux_min(a_alias[2]), + predux_min(a_alias[3])); + __half first = predux_min(m0); + __half second = predux_min(m1); +#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) + return (__hlt(first, second) ? first : second); +#else + float ffirst = __half2float(first); + float fsecond = __half2float(second); + return (ffirst < fsecond)? 
first: second; +#endif +} + +// likely overflow/underflow +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul( + const Packet4h2& a) { + const half2* a_alias = reinterpret_cast(&a); + return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), + pmul(a_alias[2], a_alias[3]))); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +plog1p(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = plog1p(a_alias[0]); + r_alias[1] = plog1p(a_alias[1]); + r_alias[2] = plog1p(a_alias[2]); + r_alias[3] = plog1p(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pexpm1(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = pexpm1(a_alias[0]); + r_alias[1] = pexpm1(a_alias[1]); + r_alias[2] = pexpm1(a_alias[2]); + r_alias[3] = pexpm1(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = plog(a_alias[0]); + r_alias[1] = plog(a_alias[1]); + r_alias[2] = plog(a_alias[2]); + r_alias[3] = plog(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = pexp(a_alias[0]); + r_alias[1] = pexp(a_alias[1]); + r_alias[2] = pexp(a_alias[2]); + r_alias[3] = pexp(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = psqrt(a_alias[0]); + r_alias[1] = psqrt(a_alias[1]); + r_alias[2] = psqrt(a_alias[2]); + r_alias[3] = 
psqrt(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +prsqrt(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = prsqrt(a_alias[0]); + r_alias[1] = prsqrt(a_alias[1]); + r_alias[2] = prsqrt(a_alias[2]); + r_alias[3] = prsqrt(a_alias[3]); + return r; +} + +// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for +// the implementation of GPU half reduction. +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __h2div(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, + const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 < b2 ? 
get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, + const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); +} + +// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) + +#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + +#undef EIGEN_GPU_HAS_LDG +#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC +#undef EIGEN_GPU_HAS_FP16_ARITHMETIC + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_GPU_H diff --git a/inst/include/Eigen/src/Core/arch/GPU/TypeCasting.h b/inst/include/Eigen/src/Core/arch/GPU/TypeCasting.h new file mode 100644 index 00000000..75454622 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -0,0 +1,80 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_GPU_H +#define EIGEN_TYPE_CASTING_GPU_H + +namespace Eigen { + +namespace internal { + +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast(const float4& a, const float4& b) { + Packet4h2 r; + half2* r_alias=reinterpret_cast(&r); + r_alias[0]=__floats2half2_rn(a.x,a.y); + r_alias[1]=__floats2half2_rn(a.z,a.w); + r_alias[2]=__floats2half2_rn(b.x,b.y); + r_alias[3]=__floats2half2_rn(b.z,b.w); + return r; +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const Packet4h2& a) { + // Simply discard the second half of the input + float4 r; + const half2* a_alias=reinterpret_cast(&a); + float2 r1 = __half22float2(a_alias[0]); + float2 r2 = __half22float2(a_alias[1]); + r.x=static_cast(r1.x); + r.y=static_cast(r1.y); + r.z=static_cast(r2.x); + r.w=static_cast(r2.y); + return r; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { + // Simply discard the second half of the input + return __floats2half2_rn(a.x, a.y); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_GPU_H diff --git a/inst/include/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/inst/include/Eigen/src/Core/arch/HIP/hcc/math_constants.h new file mode 100644 index 00000000..25375a0a --- /dev/null +++ 
b/inst/include/Eigen/src/Core/arch/HIP/hcc/math_constants.h @@ -0,0 +1,23 @@ +/* + * math_constants.h - + * HIP equivalent of the CUDA header of the same name + */ + +#ifndef __MATH_CONSTANTS_H__ +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ + +#define HIPRT_INF_F __int_as_float(0x7f800000) +#define HIPRT_NAN_F __int_as_float(0x7fffffff) +#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001) +#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000) +#define HIPRT_ZERO_F 0.0f +#define HIPRT_ONE_F 1.0f + +/* double precision constants */ +#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000) +#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000) + +#endif diff --git a/inst/include/Eigen/src/Core/arch/MSA/Complex.h b/inst/include/Eigen/src/Core/arch/MSA/Complex.h new file mode 100644 index 00000000..53dacfa4 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/MSA/Complex.h @@ -0,0 +1,648 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_COMPLEX_MSA_H +#define EIGEN_COMPLEX_MSA_H + +#include + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet2cf { + EIGEN_STRONG_INLINE Packet2cf() { + } + EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex& a, + const std::complex& b) { + Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) }; + v = t; + } + EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) { + } + EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) { + } + EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) { + v = b.v; + return *this; + } + EIGEN_STRONG_INLINE Packet2cf conjugate(void) const { + return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63)); + } + EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) { + Packet4f v1, v2; + + // Get the real values of a | a1_re | a1_re | a2_re | a2_re | + v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v); + // Get the imag values of a | a1_im | a1_im | a2_im | a2_im | + v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v); + // Multiply the real a with b + v1 = pmul(v1, b.v); + // Multiply the imag a with b + v2 = pmul(v2, b.v); + // Conjugate v2 + v2 = Packet2cf(v2).conjugate().v; + // Swap real/imag elements in v2. 
+ v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2)); + // Add and return the result + v = padd(v1, v2); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { + return Packet2cf(*this) *= b; + } + EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { + return Packet2cf(*this) += b; + } + EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { + return Packet2cf(*this) -= b; + } + EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { + *this *= b.conjugate(); + Packet4f s = pmul(b.v, b.v); + s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + v = pdiv(v, s); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { + return Packet2cf(*this) /= b; + } + EIGEN_STRONG_INLINE Packet2cf operator-(void) const { + return Packet2cf(pnegate(v)); + } + + Packet4f v; +}; + +inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) { + os << "[ (" << value.v[0] << ", " << value.v[1] + << "i)," + " (" + << value.v[2] << ", " << value.v[3] << "i) ]"; + return os; +} + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet2cf half; +}; + +template <> +EIGEN_STRONG_INLINE 
Packet2cf pset1(const std::complex& from) { + EIGEN_MSA_DEBUG; + + float f0 = from.real(), f1 = from.imag(); + Packet4f v0 = { f0, f0, f0, f0 }; + Packet4f v1 = { f1, f1, f1, f1 }; + return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a + b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a - b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return -a; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return a.conjugate(); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a * b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pand(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(por(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pxor(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pandnot(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { + EIGEN_MSA_DEBUG; + + return pset1(*from); +} 
+ +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, + const Packet2cf& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, + const Packet2cf& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( + const std::complex* from, Index stride) { + EIGEN_MSA_DEBUG; + + return Packet2cf(from[0 * stride], from[1 * stride]); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, + const Packet2cf& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = std::complex(from.v[0], from.v[1]); + to += stride; + *to = std::complex(from.v[2], from.v[3]); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + EIGEN_MSA_DEBUG; + + prefetch(reinterpret_cast(addr)); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return std::complex(a.v[0], a.v[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + Packet4f value = (Packet4f)preverse((Packet2d)a.v); + value += a.v; + return std::complex(value[0], value[1]); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return std::complex((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]), + (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) + +template <> +EIGEN_STRONG_INLINE 
Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a / b; +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet4f tmp = + (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); + kernel.packet[0].v = + (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); + kernel.packet[1].v = tmp; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, + const Packet2cf& elsePacket) { + return (Packet2cf)(Packet4f)pblend(ifPacket, (Packet2d)thenPacket.v, + (Packet2d)elsePacket.v); +} + +//---------- double ---------- + +struct Packet1cd { + EIGEN_STRONG_INLINE Packet1cd() { + } + EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex& a) { + v[0] = std::real(a); + v[1] = std::imag(a); + } + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) { + } + EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) { + } + EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) { + v = b.v; + return *this; + } + EIGEN_STRONG_INLINE Packet1cd conjugate(void) const { + static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 }; + return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR); + } + EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) { + Packet2d v1, v2; + + // Get the real values of a | a1_re | a1_re + v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v); + // Get the imag values of a | a1_im | a1_im + v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v); + // Multiply the real a with b + v1 = pmul(v1, b.v); + // Multiply the imag a with b + v2 = pmul(v2, b.v); + // Conjugate v2 + v2 = Packet1cd(v2).conjugate().v; + // Swap real/imag elements in v2. 
+ v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + // Add and return the result + v = padd(v1, v2); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { + return Packet1cd(*this) *= b; + } + EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { + return Packet1cd(*this) += b; + } + EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { + return Packet1cd(*this) -= b; + } + EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) { + *this *= b.conjugate(); + Packet2d s = pmul(b.v, b.v); + s = padd(s, preverse(s)); + v = pdiv(v, s); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const { + return Packet1cd(*this) /= b; + } + EIGEN_STRONG_INLINE Packet1cd operator-(void) const { + return Packet1cd(pnegate(v)); + } + + Packet2d v; +}; + +inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) { + os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]"; + return os; +} + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet1cd half; +}; + +template <> +EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const 
double*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { + EIGEN_MSA_DEBUG; + + return Packet1cd(from); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a + b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a - b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return -a; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return a.conjugate(); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a * b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pand(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(por(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pxor(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pandnot(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { + EIGEN_MSA_DEBUG; + + return pset1(*from); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, + const Packet1cd& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, + const Packet1cd& from) { + 
EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + EIGEN_MSA_DEBUG; + + prefetch(reinterpret_cast(addr)); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( + const std::complex* from, Index stride __attribute__((unused))) { + EIGEN_MSA_DEBUG; + + Packet1cd res; + res.v[0] = std::real(from[0]); + res.v[1] = std::imag(from[0]); + return res; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, + const Packet1cd& from, + Index stride + __attribute__((unused))) { + EIGEN_MSA_DEBUG; + + pstore(to, from); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return std::complex(a.v[0], a.v[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return pfirst(a); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return pfirst(a); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d) + +template <> +EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a / b; +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip /**/ (const Packet1cd& x) { + EIGEN_MSA_DEBUG; + + return Packet1cd(preverse(Packet2d(x.v))); +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet2d v1, v2; + + v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v); + // Get the imag values of a + v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, 
(v2i64)kernel.packet[1].v); + + kernel.packet[0].v = v1; + kernel.packet[1].v = v2; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_MSA_H diff --git a/inst/include/Eigen/src/Core/arch/MSA/MathFunctions.h b/inst/include/Eigen/src/Core/arch/MSA/MathFunctions.h new file mode 100644 index 00000000..f5181b90 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/MSA/MathFunctions.h @@ -0,0 +1,387 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// Copyright (C) 2016 Gael Guennebaud +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +/* The tanh function of this file is an adaptation of + * template T generic_fast_tanh_float(const T&) + * from MathFunctionsImpl.h. 
+ */ + +#ifndef EIGEN_MATH_FUNCTIONS_MSA_H +#define EIGEN_MATH_FUNCTIONS_MSA_H + +namespace Eigen { + +namespace internal { + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +plog(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + // Convert negative argument into NAN (quiet negative, to be specific). + Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0); + Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero); + Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero); + Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN. + Packet4f x = non_neg_x_or_nan; + + // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0. + // N.B. the exponent is one less of what frexpf() would return. + Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x)); + // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf(). 
+ x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0)); + + /* + if (x < SQRTHF) { + x = x + x - 1.0; + } else { + e += 1; + x = x - 1.0; + } + */ + Packet4f xx = padd(x, x); + Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x); + e_int = psub(e_int, ge_mask); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x); + x = psub(x, p4f_1); + Packet4f e = __builtin_msa_ffint_s_w(e_int); + + Packet4f x2 = pmul(x, x); + Packet4f x3 = pmul(x2, x); + + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y, x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y = pmadd(e, p4f_cephes_log_q1, y); + x = __builtin_msa_fmsub_w(x, x2, p4f_half); + x = padd(x, y); + x = pmadd(e, p4f_cephes_log_q2, x); + + // x is now the logarithm result candidate. We still need to handle the + // extreme arguments of zero and positive infinity, though. + // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms + // contain infinities of both signs (see the coefficients and code above). + // INFINITY - INFINITY is NAN. + + // If the argument is +INFINITY, make it the new result candidate. + // To achieve that we choose the smaller of the result candidate and the + // argument. + // This is correct for all finite pairs of values (the logarithm is smaller + // than the argument). + // This is also correct in the special case when the argument is +INFINITY + // and the result candidate is NAN. This is because the fmin.df instruction + // prefers non-NANs to NANs. + x = __builtin_msa_fmin_w(x, non_neg_x_or_nan); + + // If the argument is zero (including -0.0), the result becomes -INFINITY. 
+ Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs); + + return x; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +pexp(const Packet4f& _x) { + // Limiting single-precision pexp's argument to [-128, +128] lets pexp + // reach 0 and INFINITY naturally. + static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f); + static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + Packet4f x = _x; + + // Clamp x. + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x, + (v16u8)p4f_exp_lo); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x, + (v16u8)p4f_exp_hi); + + // Round to nearest integer by adding 0.5 (with x's sign) and truncating. 
+ Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0); + Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add); + Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2); + Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int); + + x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1); + x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2); + + Packet4f z = pmul(x, x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // y *= 2**exponent. + y = __builtin_msa_fexp2_w(y, x2_int); + + return y; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f); + // The monomial coefficients of the numerator polynomial (odd). + static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); + // The monomial coefficients of the denominator polynomial (even). 
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f); + + Packet4f x = pabs(_x); + Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny); + + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is -/+1.0f in single-precision. + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x, + (v16u8)p4f_tanh_hi); + + // Since the polynomials are odd/even, we need x**2. + Packet4f x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. + Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11); + p = pmadd(x2, p, p4f_alpha_9); + p = pmadd(x2, p, p4f_alpha_7); + p = pmadd(x2, p, p4f_alpha_5); + p = pmadd(x2, p, p4f_alpha_3); + p = pmadd(x2, p, p4f_alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. + Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4); + q = pmadd(x2, q, p4f_beta_2); + q = pmadd(x2, q, p4f_beta_0); + + // Divide the numerator by the denominator. + p = pdiv(p, q); + + // Reinstate the sign. + p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0); + + // When the argument is very small in magnitude it's more accurate to just return it. + p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x); + + return p; +} + +template +Packet4f psincos_inner_msa_float(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi). 
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f); + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi. + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + Packet4f x = pabs(_x); + + // Translate infinite arguments into NANs. + Packet4f zero_or_nan_if_inf = psub(_x, _x); + x = padd(x, zero_or_nan_if_inf); + // Prevent sin/cos from generating values larger than 1.0 in magnitude + // for very large arguments by setting x to 0.0. + Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg); + x = pand(x, (Packet4f)small_or_nan_mask); + + // Scale x by 4/Pi to find x's octant. + Packet4f y = pmul(x, p4f_cephes_FOPI); + // Get the octant. We'll reduce x by this number of octants or by one more than it. + Packet4i y_int = __builtin_msa_ftrunc_s_w(y); + // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. + // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. + // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). + Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1); + Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear + y = __builtin_msa_ffint_s_w(y_int2); + + // Compute the sign to apply to the polynomial. + Packet4i sign_mask = sine ? 
pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x) + : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29); + + // Get the polynomial selection mask. + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0); + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. + // The magic pass: "Extended precision modular arithmetic" + // x = ((x - y * DP1) - y * DP2) - y * DP3 + Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1); + Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2); + Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3); + x = padd(x, tmp1); + x = padd(x, tmp2); + x = padd(x, tmp3); + + // Evaluate the cos(x) polynomial. + y = p4f_coscof_p0; + Packet4f z = pmul(x, x); + y = pmadd(y, z, p4f_coscof_p1); + y = pmadd(y, z, p4f_coscof_p2); + y = pmul(y, z); + y = pmul(y, z); + y = __builtin_msa_fmsub_w(y, z, p4f_half); + y = padd(y, p4f_1); + + // Evaluate the sin(x) polynomial. + Packet4f y2 = p4f_sincof_p0; + y2 = pmadd(y2, z, p4f_sincof_p1); + y2 = pmadd(y2, z, p4f_sincof_p2); + y2 = pmul(y2, z); + y2 = pmadd(y2, x, x); + + // Select the correct result from the two polynomials. + y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2) + : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y); + + // Update the sign. 
+ sign_mask = pxor(sign_mask, (Packet4i)y); + y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left + return y; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +psin(const Packet4f& x) { + return psincos_inner_msa_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +pcos(const Packet4f& x) { + return psincos_inner_msa_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d +pexp(const Packet2d& _x) { + // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp + // reach 0 and INFINITY naturally. + static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0); + static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0); + static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0); + + Packet2d x = _x; + + // Clamp x. 
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x, + (v16u8)p2d_exp_lo); + x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x, + (v16u8)p2d_exp_hi); + + // Round to nearest integer by adding 0.5 (with x's sign) and truncating. + Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0); + Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add); + Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2); + Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long); + + x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1); + x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2); + + x2 = pmul(x, x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul(px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px, psub(qx, px)); + x = pmadd(p2d_2, x, p2d_1); + + // x *= 2**exponent. + x = __builtin_msa_fexp2_d(x, x2_long); + + return x; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_MSA_H diff --git a/inst/include/Eigen/src/Core/arch/MSA/PacketMath.h b/inst/include/Eigen/src/Core/arch/MSA/PacketMath.h new file mode 100644 index 00000000..afe8f337 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/MSA/PacketMath.h @@ -0,0 +1,1233 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_PACKET_MATH_MSA_H +#define EIGEN_PACKET_MATH_MSA_H + +#include +#include + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#endif + +#if 0 +#define EIGEN_MSA_DEBUG \ + static bool firstTime = true; \ + do { \ + if (firstTime) { \ + std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \ + firstTime = false; \ + } \ + } while (0) +#else +#define EIGEN_MSA_DEBUG +#endif + +#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a)) + +typedef v4f32 Packet4f; +typedef v4i32 Packet4i; +typedef v4u32 Packet4ui; + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } +#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } +#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } + +inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4f type; + typedef Packet4f half; // Packet2f intrinsics not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, // Packet2f intrinsics not implemented 
yet + // FIXME check the Has* + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4i type; + typedef Packet4i half; // Packet2i intrinsics not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, // Packet2i intrinsics not implemented yet + // FIXME check the Has* + HasDiv = 1, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet4f half; +}; + +template <> +struct unpacket_traits { + typedef int32_t type; + enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet4i half; +}; + +template <> +EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { + EIGEN_MSA_DEBUG; + + Packet4f v = { from, from, from, from }; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fill_w(from); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload1(const float* from) { + EIGEN_MSA_DEBUG; + + float f = *from; + Packet4f v = { f, f, f, f }; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pload1(const int32_t* from) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fill_w(*from); +} + +template <> +EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fadd_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_addv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f 
plset(const float& a) { + EIGEN_MSA_DEBUG; + + static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { + EIGEN_MSA_DEBUG; + + static const Packet4i countdown = { 0, 1, 2, 3 }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsub_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_subv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmul_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_mulv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fdiv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_div_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmadd_w(c, a, b); +} + +template <> +EIGEN_STRONG_INLINE 
Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { + EIGEN_MSA_DEBUG; + + // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug. + Packet4i value = c; + __asm__("maddv.w %w[value], %w[a], %w[b]\n" + // Outputs + : [value] "+f"(value) + // Inputs + : [a] "f"(a), [b] "f"(b)); + return value; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmin_w(a, b); +#else + // This prefers NaNs to numbers. 
+  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+  Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_min_s_w(a, b);  // signed 32-bit lanes
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  Packet4i aNaN = __builtin_msa_fcun_w(a, a);  // lanes where a is unordered with itself, i.e. NaN
+  Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_max_s_w(a, b);  // signed 32-bit lanes
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload(const float* from) {  // load tagged as aligned
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) {  // load tagged as aligned
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) {  // load tagged as unaligned
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) {  // load tagged as unaligned
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup(const
int32_t* from) { + EIGEN_MSA_DEBUG; + + int32_t i0 = from[0], i1 = from[1]; + Packet4i v0 = { i0, i0, i0, i0 }; + Packet4i v1 = { i1, i1, i1, i1 }; + return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { + EIGEN_MSA_DEBUG; + + float f = *from; + Packet4f v = { f, f, f, f }; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return v; +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { + EIGEN_MSA_DEBUG; + + int32_t i = *from; + Packet4i v = { i, i, i, i }; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return v; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; + to += stride; + *to = from[2]; + to += stride; + *to = from[3]; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; + to += stride; + *to = from[2]; + to += stride; + *to = from[3]; +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const float* addr) { + 
EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { + EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i zero = __builtin_msa_ldi_w(0); + return __builtin_msa_add_a_w(zero, a); +} + +template <> +EIGEN_STRONG_INLINE float predux(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return s[0]; +} + + +template <> +EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return s[0]; +} + +// Other reduction functions: +// mul +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return p[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t 
predux_mul(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return p[0]; +} + +// min +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + // Swap 64-bit halves of a. + Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +#if !EIGEN_FAST_MATH + // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit + // masks of all zeroes/ones in low 64 bits. + v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); + // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. + unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); +#endif + // Continue with min computation. + Packet4f v = __builtin_msa_fmin_w(a, swapped); + v = __builtin_msa_fmin_w( + v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +#if !EIGEN_FAST_MATH + // Based on the mask select between v and 4 qNaNs. + v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); + v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); +#endif + return v[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return m[0]; +} + +// max +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + // Swap 64-bit halves of a. + Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +#if !EIGEN_FAST_MATH + // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit + // masks of all zeroes/ones in low 64 bits. + v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); + // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. 
+ unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); +#endif + // Continue with max computation. + Packet4f v = __builtin_msa_fmax_w(a, swapped); + v = __builtin_msa_fmax_w( + v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +#if !EIGEN_FAST_MATH + // Based on the mask select between v and 4 qNaNs. + v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); + v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); +#endif + return v[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return m[0]; +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl + << " " << value.packet[1] << "," << std::endl + << " " << value.packet[2] << "," << std::endl + << " " << value.packet[3] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + + tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); + tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); + tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); + tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); + + kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); + kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl + << " " << value.packet[1] << "," << std::endl + << " " << value.packet[2] 
<< "," << std::endl + << " " << value.packet[3] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + + tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]); + tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]); + tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]); + tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); + kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsqrt_w(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + return __builtin_msa_frsqrt_w(a); +#else + Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1)); + return pdiv(ones, psqrt(a)); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. 
+ "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, + const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], + ifPacket.select[3] }; + Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); + return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, + const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], + ifPacket.select[3] }; + Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); + return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +//---------- double ---------- + +typedef v2f64 Packet2d; +typedef v2i64 Packet2l; +typedef v2u64 Packet2ul; + +#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } +#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } +#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } + +inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) { + os << "[ " << 
value[0] << ", " << value[1] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + // FIXME check the Has* + HasDiv = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet2d half; +}; + +template <> +EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + EIGEN_MSA_DEBUG; + + Packet2d value = { from, from }; + return value; +} + +template <> +EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fadd_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d plset(const double& a) { + EIGEN_MSA_DEBUG; + + static const Packet2d countdown = { 0.0, 1.0 }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsub_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmul_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE 
Packet2d pdiv(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fdiv_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmadd_d(c, a, b); +} + +// Logical Operations are not supported for float, so we have to reinterpret casts using MSA +// intrinsics +template <> +EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pload(const double* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmin_d(a, b); +#else + // This prefers NaNs to numbers. + v2i64 aNaN = __builtin_msa_fcun_d(a, a); + v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN); + return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmax_d(a, b); +#else + // This prefers NaNs to numbers. 
+ v2i64 aNaN = __builtin_msa_fcun_d(a, a); + v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN); + return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { + EIGEN_MSA_DEBUG; + + Packet2d value = { *from, *from }; + return value; +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { + EIGEN_MSA_DEBUG; + + Packet2d value; + value[0] = *from; + from += stride; + value[1] = *from; + return value; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const double* addr) { + EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63); +} + +template <> +EIGEN_STRONG_INLINE double predux(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + Packet2d s = padd(a, preverse(a)); + return s[0]; +} + +// Other 
reduction functions: +// mul +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + Packet2d p = pmul(a, preverse(a)); + return p[0]; +} + +// min +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + Packet2d v = __builtin_msa_fmin_d(a, swapped); + return v[0]; +#else + double a0 = a[0], a1 = a[1]; + return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1; +#endif +} + +// max +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + Packet2d v = __builtin_msa_fmax_d(a, swapped); + return v[0]; +#else + double a0 = a[0], a1 = a[1]; + return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1; +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsqrt_d(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + return __builtin_msa_frsqrt_d(a); +#else + Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1)); + return pdiv(ones, psqrt(a)); +#endif +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); + Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); + kernel.packet[0] = trn1; + kernel.packet[1] = trn2; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; 
+ asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. 
+ "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, + const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0); + return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_MSA_H diff --git a/inst/include/Eigen/src/Core/arch/NEON/Complex.h b/inst/include/Eigen/src/Core/arch/NEON/Complex.h index 306a309b..f40af7f8 100644 --- a/inst/include/Eigen/src/Core/arch/NEON/Complex.h +++ b/inst/include/Eigen/src/Core/arch/NEON/Complex.h @@ -15,9 +15,10 @@ namespace Eigen { namespace internal { -inline uint32x4_t p4ui_CONJ_XOR() { +inline uint32x4_t p4ui_CONJ_XOR() +{ // See bug 1325, clang fails to call vld1q_u64. 
-#if EIGEN_COMP_CLANG +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; return ret; #else @@ -26,61 +27,136 @@ inline uint32x4_t p4ui_CONJ_XOR() { #endif } -inline uint32x2_t p2ui_CONJ_XOR() { +inline uint32x2_t p2ui_CONJ_XOR() +{ static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 }; return vld1_u32( conj_XOR_DATA ); } //---------- float ---------- + +struct Packet1cf +{ + EIGEN_STRONG_INLINE Packet1cf() {} + EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {} + Packet2f v; +}; struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} - Packet4f v; + Packet4f v; }; -template<> struct packet_traits > : default_packet_traits +template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; - typedef Packet2cf half; - enum { + typedef Packet1cf half; + enum + { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, HasSetLinear = 0 }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; - -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +template<> struct unpacket_traits { - float32x2_t r64; - r64 = vld1_f32((const float *)&from); + typedef std::complex type; + typedef Packet1cf half; + typedef Packet2f as_real; + enum + { + size = 1, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef std::complex type; + typedef Packet1cf half; + typedef Packet4f as_real; + enum + { + size = 2, + 
alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet1cf pcast(const float& a) +{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); } +template<> EIGEN_STRONG_INLINE Packet2cf pcast(const Packet2f& a) +{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); } +template<> EIGEN_STRONG_INLINE Packet1cf pset1(const std::complex& from) +{ return Packet1cf(vld1_f32(reinterpret_cast(&from))); } +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + const float32x2_t r64 = vld1_f32(reinterpret_cast(&from)); return Packet2cf(vcombine_f32(r64, r64)); } -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cf padd(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(padd(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf psub(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(psub(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) +{ + const Packet2ui b = vreinterpret_u32_f32(a.v); + return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); +} template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - Packet4ui b = vreinterpretq_u32_f32(a.v); + const 
Packet4ui b = vreinterpretq_u32_f32(a.v); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } +template<> EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) +{ + Packet2f v1, v2; + + // Get the real values of a | a1_re | a1_re | + v1 = vdup_lane_f32(a.v, 0); + // Get the imag values of a | a1_im | a1_im | + v2 = vdup_lane_f32(a.v, 1); + // Multiply the real a with b + v1 = vmul_f32(v1, b.v); + // Multiply the imag a with b + v2 = vmul_f32(v2, b.v); + // Conjugate v2 + v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); + // Swap real/imag elements in v2. + v2 = vrev64_f32(v2); + // Add and return the result + return Packet1cf(vadd_f32(v1, v2)); +} template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { Packet4f v1, v2; @@ -93,7 +169,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con v1 = vmulq_f32(v1, b.v); // Multiply the imag a with b v2 = vmulq_f32(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR())); // Swap real/imag elements in v2. 
v2 = vrev64q_f32(v2); @@ -101,98 +177,144 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con return Packet2cf(vaddq_f32(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) +template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b) { - return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a[0])==re(b[0]), im(a[0])==im(b[0])] + Packet2f eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a[0])==im(b[0]), re(a[0])==re(b[0])] + Packet2f eq_swapped = vrev64_f32(eq); + // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cf(pand(eq, eq_swapped)); } -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])] + Packet4f eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])] + Packet4f eq_swapped = vrev64q_f32(eq); + // Return re(a)==re(b) && im(a)==im(b) by computing 
bitwise AND of eq and eq_swapped + return Packet2cf(pand(eq, eq_swapped)); } -template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cf pand(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet1cf por(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE Packet1cf pxor(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } -template<> EIGEN_DEVICE_FUNC inline 
Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +template<> EIGEN_STRONG_INLINE Packet1cf pandnot(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cf pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cf ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cf ploaddup(const std::complex* from) +{ return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) +{ return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cf& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet2cf& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } + +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cf& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet2cf& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather, Packet1cf>( + const 
std::complex* from, Index stride) +{ + const Packet2f tmp = vdup_n_f32(std::real(from[0*stride])); + return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1)); +} +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( + const std::complex* from, Index stride) { - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + Packet4f res = vdupq_n_f32(std::real(from[0*stride])); res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); return Packet2cf(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cf>( + std::complex* to, const Packet1cf& from, Index stride) +{ to[stride*0] = std::complex(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); } +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>( + std::complex* to, const Packet2cf& from, Index stride) { to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) +{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cf& a) { - std::complex EIGEN_ALIGN16 x[2]; - vst1q_f32((float *)x, a.v); + EIGEN_ALIGN16 std::complex x; + vst1_f32(reinterpret_cast(&x), a.v); + return x; +} +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + EIGEN_ALIGN16 std::complex x[2]; + vst1q_f32(reinterpret_cast(x), a.v); return x[0]; } +template<> EIGEN_STRONG_INLINE Packet1cf 
preverse(const Packet1cf& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - float32x2_t a_lo, a_hi; - Packet4f a_r128; - - a_lo = vget_low_f32(a.v); - a_hi = vget_high_f32(a.v); - a_r128 = vcombine_f32(a_hi, a_lo); - - return Packet2cf(a_r128); -} +{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip(const Packet1cf& a) +{ return Packet1cf(vrev64_f32(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) +{ return Packet2cf(vrev64q_f32(a.v)); } + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cf& a) { - return Packet2cf(vrev64q_f32(a.v)); + std::complex s; + vst1_f32((float *)&s, a.v); + return s; } - template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { - float32x2_t a1, a2; std::complex s; - - a1 = vget_low_f32(a.v); - a2 = vget_high_f32(a.v); - a2 = vadd_f32(a1, a2); - vst1_f32((float *)&s, a2); - + vst1_f32(reinterpret_cast(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v))); return s; } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cf& a) { - Packet4f sum1, sum2, sum; - - // Add the first two 64-bit float32x2_t of vecs[0] - sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v)); - sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v)); - sum = vaddq_f32(sum1, sum2); - - return Packet2cf(sum); + std::complex s; + vst1_f32((float *)&s, a.v); + return s; } - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { float32x2_t a1, a2, v1, v2, prod; @@ -208,90 +330,67 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P v1 = vmul_f32(v1, a2); // Multiply the imag a with b v2 = vmul_f32(v2, a2); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); // Swap real/imag 
elements in v2. v2 = vrev64_f32(v2); // Add v1, v2 prod = vadd_f32(v1, v2); - vst1_f32((float *)&s, prod); + vst1_f32(reinterpret_cast(&s), prod); return s; } -template -struct palign_impl -{ - EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = vextq_f32(first.v, second.v, 2); - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) -template<> struct conj_helper +template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, const Packet1cf& b) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; + // TODO optimize it for NEON + Packet1cf res = pmul(a, pconj(b)); + Packet2f s, rev_s; -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + // this computes the norm + s = vmul_f32(b.v, b.v); + rev_s = vrev64_f32(s); + return Packet1cf(pdiv(res.v, vadd_f32(s, rev_s))); +} template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a,pconj(b)); Packet4f s, rev_s; // this computes the norm s = 
vmulq_f32(b.v, b.v); rev_s = vrev64q_f32(s); - return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); + return Packet2cf(pdiv(res.v, vaddq_f32(s, rev_s))); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& /*kernel*/) {} +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) +{ Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet1cf psqrt(const Packet1cf& a) { + return psqrt_complex(a); +} + +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { + return psqrt_complex(a); +} + //---------- double ---------- #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG // See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; #else const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; @@ -309,7 +408,8 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 0, size = 1, @@ -328,24 +428,50 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits +{ + typedef std::complex type; + typedef Packet1cd half; + typedef Packet2d as_real; + enum + { + size=1, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cd 
ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ + /* here we really have to use unaligned loads :( */ + return ploadu(&from); +} -template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(padd(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) -{ /* here we really have to use unaligned loads :( */ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(psub(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) +{ return Packet1cd(pnegate(a.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d v1, v2; - // Get the real values of a + // Get the real values of a v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); // Get the 
imag values of a v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); @@ -353,7 +479,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con v1 = vmulq_f64(v1, b.v); // Multiply the imag a with b v2 = vmulq_f64(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); // Swap real/imag elements in v2. v2 = preverse(v2); @@ -361,31 +487,44 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(vaddq_f64(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { - return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a)==re(b), im(a)==im(b)] + Packet2d eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a)==im(b), re(a)==re(b)] + Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq))); + // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cd(pand(eq, eq_swapped)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) +{ return 
Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} +{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) +{ return pset1(*from); } -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cd& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cd& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) +{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( + const std::complex* from, Index stride) { Packet2d res = pset1(0.0); res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); @@ -393,17 
+532,14 @@ template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Pack return Packet1cd(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) -{ - to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); -} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>( + std::complex* to, const Packet1cd& from, Index stride) +{ to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); } - -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); - return res; } @@ -411,59 +547,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
- } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); Packet2d rev_s = preverse(s); @@ -471,9 +562,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} +{ return Packet1cd(preverse(Packet2d(x.v))); } EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -481,6 +570,11 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v)); kernel.packet[1].v = tmp; } + +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) { + return psqrt_complex(a); +} + #endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h 
b/inst/include/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h new file mode 100644 index 00000000..3481f337 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -0,0 +1,183 @@ +namespace Eigen { +namespace internal { + +#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG + +// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm. +// Here we specialize gebp_traits to eliminate these register spills. +// See #2138. +template<> +struct gebp_traits + : gebp_traits +{ + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + // This volatile inline ASM both acts as a barrier to prevent reordering, + // as well as enforces strict register use. + asm volatile( + "vmla.f32 %q[r], %q[c], %q[alpha]" + : [r] "+w" (r) + : [c] "w" (c), + [alpha] "w" (alpha) + : ); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, + Packet4f& c, Packet4f& tmp, + const LaneIdType&) const { + acc(a, b, c); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket& b, + Packet4f& c, Packet4f& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } +}; + +#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG + +#if EIGEN_ARCH_ARM64 + +template<> +struct gebp_traits + : gebp_traits +{ + typedef float RhsPacket; + typedef float32x4_t RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest = vld1q_f32(b); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, 
RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f32(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif + } +}; + + +template<> +struct gebp_traits + : gebp_traits +{ + typedef double RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + 
dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + else if(LaneID==3) 
asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } +}; + +#endif // EIGEN_ARCH_ARM64 + +} // namespace internal +} // namespace Eigen diff --git a/inst/include/Eigen/src/Core/arch/NEON/MathFunctions.h b/inst/include/Eigen/src/Core/arch/NEON/MathFunctions.h index 6bb05bb9..fa6615a8 100644 --- a/inst/include/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/inst/include/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -5,10 +5,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - #ifndef EIGEN_MATH_FUNCTIONS_NEON_H #define EIGEN_MATH_FUNCTIONS_NEON_H @@ -16,74 +12,62 @@ namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) -{ - Packet4f x = _x; - Packet4f tmp, fx; - - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - 
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - x = vminq_f32(x, p4f_exp_hi); - x = vmaxq_f32(x, p4f_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - Packet4ui mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, p4f_cephes_exp_C1); - Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); - z = vmulq_f32(x, x); - y = vaddq_f32(y, p4f_cephes_exp_p1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, p4f_1); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, p4i_0x7f); - mm = vshlq_n_s32(mm, 23); - Packet4f pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp(const Packet2f& x) +{ return pexp_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& x) +{ return pexp_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog(const Packet2f& x) +{ return plog_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& x) +{ return plog_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin(const Packet2f& x) +{ 
return psin_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& x) +{ return psin_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos(const Packet2f& x) +{ return pcos_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& x) +{ return pcos_float(x); } + +// Hyperbolic Tangent function. +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh(const Packet2f& x) +{ return internal::generic_fast_tanh_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh(const Packet4f& x) +{ return internal::generic_fast_tanh_float(x); } + +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh) + +template <> +EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) { + Packet4f fexponent; + const Packet4bf out = F32ToBf16(pfrexp(Bf16ToF32(a), fexponent)); + exponent = F32ToBf16(fexponent); + return out; +} + +template <> +EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& exponent) { + return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent))); } +//---------- double ---------- + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& x) +{ return pexp_double(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d plog(const Packet2d& x) +{ return plog_double(x); } + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/arch/NEON/PacketMath.h b/inst/include/Eigen/src/Core/arch/NEON/PacketMath.h 
index 3d5ed0d2..d2aeef43 100644 --- a/inst/include/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/inst/include/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,54 +24,118 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #if EIGEN_ARCH_ARM64 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #else -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif #endif -#if EIGEN_COMP_MSVC +#if EIGEN_COMP_MSVC_STRICT // In MSVC's arm_neon.h header file, all NEON vector types // are aliases to the same underlying type __n128. // We thus have to wrap them to make them different C++ types. // (See also bug 1428) +typedef eigen_packet_wrapper Packet2f; +typedef eigen_packet_wrapper Packet4f; +typedef eigen_packet_wrapper Packet4c; +typedef eigen_packet_wrapper Packet8c; +typedef eigen_packet_wrapper Packet16c; +typedef eigen_packet_wrapper Packet4uc; +typedef eigen_packet_wrapper Packet8uc; +typedef eigen_packet_wrapper Packet16uc; +typedef eigen_packet_wrapper Packet4s; +typedef eigen_packet_wrapper Packet8s; +typedef eigen_packet_wrapper Packet4us; +typedef eigen_packet_wrapper Packet8us; +typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet4i; +typedef eigen_packet_wrapper Packet2ui; +typedef eigen_packet_wrapper Packet4ui; +typedef eigen_packet_wrapper Packet2l; +typedef eigen_packet_wrapper Packet2ul; -template -struct eigen_packet_wrapper -{ - operator T&() { return m_val; } - operator const T&() const { return m_val; } - eigen_packet_wrapper() {} - eigen_packet_wrapper(const T &v) : m_val(v) {} - eigen_packet_wrapper& operator=(const T &v) { - m_val = v; - return *this; - } +#else - T m_val; -}; -typedef eigen_packet_wrapper Packet2f; -typedef eigen_packet_wrapper Packet4f; -typedef eigen_packet_wrapper Packet4i; -typedef 
eigen_packet_wrapper Packet2i; -typedef eigen_packet_wrapper Packet4ui; +typedef float32x2_t Packet2f; +typedef float32x4_t Packet4f; +typedef eigen_packet_wrapper Packet4c; +typedef int8x8_t Packet8c; +typedef int8x16_t Packet16c; +typedef eigen_packet_wrapper Packet4uc; +typedef uint8x8_t Packet8uc; +typedef uint8x16_t Packet16uc; +typedef int16x4_t Packet4s; +typedef int16x8_t Packet8s; +typedef uint16x4_t Packet4us; +typedef uint16x8_t Packet8us; +typedef int32x2_t Packet2i; +typedef int32x4_t Packet4i; +typedef uint32x2_t Packet2ui; +typedef uint32x4_t Packet4ui; +typedef int64x2_t Packet2l; +typedef uint64x2_t Packet2ul; + +#endif // EIGEN_COMP_MSVC_STRICT + +EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ + const float* a = reinterpret_cast(&m); + Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + return res; +} -#else +// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave +// == false (i.e. shuffle(m, n, mask) equals _mm_shuffle_ps(m, n, mask)), +// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h +// to enable a shared implementation for fast inversion of matrices of size 4. 
+template +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) +{ + const float* a = reinterpret_cast(&m); + const float* b = reinterpret_cast(&n); + Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + return res; +} + +template<> +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) +{ + const float* a = reinterpret_cast(&m); + const float* b = reinterpret_cast(&n); + Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + return res; +} -typedef float32x2_t Packet2f; -typedef float32x4_t Packet4f; -typedef int32x4_t Packet4i; -typedef int32x2_t Packet2i; -typedef uint32x4_t Packet4ui; +EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));} -#endif // EIGEN_COMP_MSVC +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) +{ + return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(p, q, r, s)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); +} +#define vec4f_duplane(a, p) \ + vdupq_lane_f32(vget_low_f32(a), p) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f 
p4f_##NAME = pset1(X) @@ -98,660 +162,4423 @@ typedef uint32x4_t Packet4ui; #define EIGEN_ARM_PREFETCH(ADDR) #endif -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4f type; - typedef Packet4f half; // Packet2f intrinsics not implemented yet +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet4f type; + typedef Packet2f half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. + HasNdtri = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet16c type; + typedef Packet8c half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet16uc type; + typedef Packet8uc half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet8s type; + typedef Packet4s half; + enum + { + 
Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet8us type; + typedef Packet4us half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet4i type; + typedef Packet2i half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet4ui type; + typedef Packet2ui half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet2l type; + typedef Packet2l half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + 
HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet2ul type; + typedef Packet2ul half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM +// workaround gcc 4.2, 4.3 and 4.4 compilation issue +EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } +EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } +#endif + +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet2f half; + typedef Packet2i integer_packet; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet2f half; + typedef Packet4i integer_packet; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet4c half; + enum + { + size = 4, + alignment = Unaligned, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet4c half; + enum + { 
+ size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet8c half; + enum + { + size = 16, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet4uc half; + enum + { + size = 4, + alignment = Unaligned, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet4uc half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet8uc half; + enum + { + size = 16, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false}; +}; +template<> struct unpacket_traits +{ + typedef int16_t type; + typedef Packet4s half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int16_t type; + typedef Packet4s half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint16_t type; + typedef Packet4us half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint16_t type; + typedef Packet4us half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; 
+}; +template<> struct unpacket_traits +{ + typedef int32_t type; + typedef Packet2i half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int32_t type; + typedef Packet2i half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint32_t type; + typedef Packet2ui half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint32_t type; + typedef Packet2ui half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int64_t type; + typedef Packet2l half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint64_t type; + typedef Packet2ul half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet2f pset1(const float& from) { return vdup_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c pset1(const int8_t& from) +{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pset1(const int8_t& from) { return vdup_n_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c pset1(const int8_t& from) { return vdupq_n_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc pset1(const uint8_t& from) +{ 
return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc pset1(const uint8_t& from) { return vdup_n_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc pset1(const uint8_t& from) { return vdupq_n_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s pset1(const int16_t& from) { return vdup_n_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s pset1(const int16_t& from) { return vdupq_n_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us pset1(const uint16_t& from) { return vdup_n_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us pset1(const uint16_t& from) { return vdupq_n_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i pset1(const int32_t& from) { return vdup_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui pset1(const uint32_t& from) { return vdup_n_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui pset1(const uint32_t& from) { return vdupq_n_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l pset1(const int64_t& from) { return vdupq_n_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul pset1(const uint64_t& from) { return vdupq_n_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet2f pset1frombits(unsigned int from) +{ return vreinterpret_f32_u32(vdup_n_u32(from)); } +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) +{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); } + +template<> EIGEN_STRONG_INLINE Packet2f plset(const float& a) +{ + const float c[] = {0.0f,1.0f}; + return vadd_f32(pset1(a), vld1_f32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) +{ + const float c[] = {0.0f,1.0f,2.0f,3.0f}; + return vaddq_f32(pset1(a), vld1q_f32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4c plset(const int8_t& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); } +template<> 
EIGEN_STRONG_INLINE Packet8c plset(const int8_t& a) +{ + const int8_t c[] = {0,1,2,3,4,5,6,7}; + return vadd_s8(pset1(a), vld1_s8(c)); +} +template<> EIGEN_STRONG_INLINE Packet16c plset(const int8_t& a) +{ + const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + return vaddq_s8(pset1(a), vld1q_s8(c)); +} +template<> EIGEN_STRONG_INLINE Packet4uc plset(const uint8_t& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc plset(const uint8_t& a) +{ + const uint8_t c[] = {0,1,2,3,4,5,6,7}; + return vadd_u8(pset1(a), vld1_u8(c)); +} +template<> EIGEN_STRONG_INLINE Packet16uc plset(const uint8_t& a) +{ + const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + return vaddq_u8(pset1(a), vld1q_u8(c)); +} +template<> EIGEN_STRONG_INLINE Packet4s plset(const int16_t& a) +{ + const int16_t c[] = {0,1,2,3}; + return vadd_s16(pset1(a), vld1_s16(c)); +} +template<> EIGEN_STRONG_INLINE Packet4us plset(const uint16_t& a) +{ + const uint16_t c[] = {0,1,2,3}; + return vadd_u16(pset1(a), vld1_u16(c)); +} +template<> EIGEN_STRONG_INLINE Packet8s plset(const int16_t& a) +{ + const int16_t c[] = {0,1,2,3,4,5,6,7}; + return vaddq_s16(pset1(a), vld1q_s16(c)); +} +template<> EIGEN_STRONG_INLINE Packet8us plset(const uint16_t& a) +{ + const uint16_t c[] = {0,1,2,3,4,5,6,7}; + return vaddq_u16(pset1(a), vld1q_u16(c)); +} +template<> EIGEN_STRONG_INLINE Packet2i plset(const int32_t& a) +{ + const int32_t c[] = {0,1}; + return vadd_s32(pset1(a), vld1_s32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) +{ + const int32_t c[] = {0,1,2,3}; + return vaddq_s32(pset1(a), vld1q_s32(c)); +} +template<> EIGEN_STRONG_INLINE Packet2ui plset(const uint32_t& a) +{ + const uint32_t c[] = {0,1}; + return vadd_u32(pset1(a), vld1_u32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4ui plset(const uint32_t& a) +{ + const uint32_t c[] = {0,1,2,3}; + return 
vaddq_u32(pset1(a), vld1q_u32(c)); +} +template<> EIGEN_STRONG_INLINE Packet2l plset(const int64_t& a) +{ + const int64_t c[] = {0,1}; + return vaddq_s64(pset1(a), vld1q_s64(c)); +} +template<> EIGEN_STRONG_INLINE Packet2ul plset(const uint64_t& a) +{ + const uint64_t c[] = {0,1}; + return vaddq_u64(pset1(a), vld1q_u64(c)); +} + +template<> EIGEN_STRONG_INLINE Packet2f padd(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c padd(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vadd_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c padd(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c padd(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc padd(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vadd_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc padd(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc padd(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s padd(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us padd(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us padd(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i padd(const Packet2i& a, const Packet2i& b) { return 
vadd_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui padd(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui padd(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l padd(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul padd(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f psub(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c psub(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vsub_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c psub(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c psub(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc psub(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vsub_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc psub(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s psub(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us psub(const Packet4us& a, const 
Packet4us& b) { return vsub_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us psub(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i psub(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui psub(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui psub(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l psub(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b); +template<> EIGEN_STRONG_INLINE Packet2f paddsub(const Packet2f& a, const Packet2f & b) { + Packet2f mask = {numext::bit_cast(0x80000000u), 0.0f}; + return padd(a, pxor(mask, b)); +} +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { + Packet4f mask = {numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f}; + return padd(a, pxor(mask, b)); +} + +template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); } +template<> EIGEN_STRONG_INLINE Packet4s pnegate(const 
Packet4s& a) { return vneg_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); } +template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) { +#if EIGEN_ARCH_ARM64 + return vnegq_s64(a); +#else + return vcombine_s64( + vdup_n_s64(-vgetq_lane_s64(a, 0)), + vdup_n_s64(-vgetq_lane_s64(a, 1))); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { 
return a; } + +template<> EIGEN_STRONG_INLINE Packet2f pmul(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c pmul(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmul_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmul(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmul(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmul(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmul_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmul(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmul(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmul(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmul(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmul(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmul(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmul(const Packet4ui& a, const 
Packet4ui& b) { return vmulq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pmul(const Packet2l& a, const Packet2l& b) { + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)), + vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1))); +} +template<> EIGEN_STRONG_INLINE Packet2ul pmul(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)), + vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1))); +} + +template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) +{ +#if EIGEN_ARCH_ARM64 + return vdiv_f32(a,b); +#else + Packet2f inv, restep, div; + + // NEON does not offer a divide instruction, we have to do a reciprocal approximation + // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers + // a reciprocal estimate AND a reciprocal step -which saves a few instructions + // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with + // Newton-Raphson and vrecpsq_f32() + inv = vrecpe_f32(b); + + // This returns a differential, by which we will have to multiply inv to get a better + // approximation of 1/b. + restep = vrecps_f32(b, inv); + inv = vmul_f32(restep, inv); + + // Finally, multiply a by 1/b and get the wanted result of the division. 
+ div = vmul_f32(a, inv); + + return div; +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ +#if EIGEN_ARCH_ARM64 + return vdivq_f32(a,b); +#else + Packet4f inv, restep, div; + + // NEON does not offer a divide instruction, we have to do a reciprocal approximation + // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers + // a reciprocal estimate AND a reciprocal step -which saves a few instructions + // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with + // Newton-Raphson and vrecpsq_f32() + inv = vrecpeq_f32(b); + + // This returns a differential, by which we will have to multiply inv to get a better + // approximation of 1/b. + restep = vrecpsq_f32(b, inv); + inv = vmulq_f32(restep, inv); + + // Finally, multiply a by 1/b and get the wanted result of the division. + div = vmulq_f32(a, inv); + + return div; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4c pdiv(const Packet4c& /*a*/, const Packet4c& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet8c pdiv(const Packet8c& /*a*/, const Packet8c& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet16c pdiv(const Packet16c& /*a*/, const Packet16c& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4uc pdiv(const Packet4uc& /*a*/, const Packet4uc& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pdiv(const Packet8uc& /*a*/, const Packet8uc& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet16uc pdiv(const Packet16uc& /*a*/, const Packet16uc& 
/*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4s pdiv(const Packet4s& /*a*/, const Packet4s& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet8s pdiv(const Packet8s& /*a*/, const Packet8s& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4us pdiv(const Packet4us& /*a*/, const Packet4us& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet8us pdiv(const Packet8us& /*a*/, const Packet8us& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet2i pdiv(const Packet2i& /*a*/, const Packet2i& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet2ui pdiv(const Packet2ui& /*a*/, const Packet2ui& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4ui pdiv(const Packet4ui& /*a*/, const Packet4ui& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet2l pdiv(const Packet2l& /*a*/, const Packet2l& /*b*/) +{ + eigen_assert(false && "packet integer division are not supported by NEON"); + return pset1(0LL); +} +template<> EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& /*a*/, const Packet2ul& /*b*/) +{ + eigen_assert(false 
&& "packet integer division are not supported by NEON"); + return pset1(0ULL); +} + + +#ifdef __ARM_FEATURE_FMA +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ return vfmaq_f32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) +{ return vfma_f32(c,a,b); } +#else +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ + return vmlaq_f32(c,a,b); +} +template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) +{ + return vmla_f32(c,a,b); +} +#endif + +// No FMA instruction for int, so use MLA unconditionally. +template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmla_s8( + vreinterpret_s8_s32(vdup_n_s32(c)), + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) +{ return vmla_s8(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) +{ return vmlaq_s8(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmla_u8( + vreinterpret_u8_u32(vdup_n_u32(c)), + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) +{ return vmla_u8(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) +{ return vmlaq_u8(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) +{ return vmla_s16(c,a,b); } +template<> 
EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) +{ return vmlaq_s16(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) +{ return vmla_u16(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) +{ return vmlaq_u16(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) +{ return vmla_s32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) +{ return vmlaq_s32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) +{ return vmla_u32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) +{ return vmlaq_u32(c,a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pabsdiff(const Packet2f& a, const Packet2f& b) +{ return vabd_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pabsdiff(const Packet4f& a, const Packet4f& b) +{ return vabdq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c pabsdiff(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vabd_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pabsdiff(const Packet8c& a, const Packet8c& b) +{ return vabd_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pabsdiff(const Packet16c& a, const Packet16c& b) +{ return vabdq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vabd_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff(const Packet8uc& a, const Packet8uc& b) +{ return vabd_u8(a,b); } +template<> 
EIGEN_STRONG_INLINE Packet16uc pabsdiff(const Packet16uc& a, const Packet16uc& b) +{ return vabdq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pabsdiff(const Packet4s& a, const Packet4s& b) +{ return vabd_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pabsdiff(const Packet8s& a, const Packet8s& b) +{ return vabdq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pabsdiff(const Packet4us& a, const Packet4us& b) +{ return vabd_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pabsdiff(const Packet8us& a, const Packet8us& b) +{ return vabdq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pabsdiff(const Packet2i& a, const Packet2i& b) +{ return vabd_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pabsdiff(const Packet4i& a, const Packet4i& b) +{ return vabdq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff(const Packet2ui& a, const Packet2ui& b) +{ return vabd_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff(const Packet4ui& a, const Packet4ui& b) +{ return vabdq_u32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pmin(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } + +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). 
+template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); } +template<> EIGEN_STRONG_INLINE Packet2f pmin(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return pmin(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2f pmin(const Packet2f& a, const Packet2f& b) { return pmin(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4c pmin(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmin_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmin(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmin(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmin(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmin_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmin(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmin(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pmin(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmin(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmin(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmin(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { 
return vminq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmin(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmin(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pmin(const Packet2l& a, const Packet2l& b) { + return vcombine_s64( + vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))), + vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1)))); +} +template<> EIGEN_STRONG_INLINE Packet2ul pmin(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u64( + vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))), + vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)))); +} + +template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } + +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). 
+template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); } +template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return pmax(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return pmax(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4c pmax(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmax_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmax(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmax(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmax_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmax(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmax(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pmax(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmax(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmax(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmax(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { 
return vmaxq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmax(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmax(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pmax(const Packet2l& a, const Packet2l& b) { + return vcombine_s64( + vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))), + vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1)))); +} +template<> EIGEN_STRONG_INLINE Packet2ul pmax(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u64( + vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))), + vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)))); +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_le(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vcle_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_le(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vcle_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_le(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vcle_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vcle_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le(const Packet8uc& a, const Packet8uc& b) +{ return vcle_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) +{ return vcleq_u8(a,b); 
} +template<> EIGEN_STRONG_INLINE Packet4s pcmp_le(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vcle_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_le(const Packet4us& a, const Packet4us& b) +{ return vcle_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) +{ return vcleq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_le(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vcle_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le(const Packet2ui& a, const Packet2ui& b) +{ return vcle_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) +{ return vcleq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vcleq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vcleq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? 
numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vclt_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vclt_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vclt_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vclt_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt(const Packet8uc& a, const Packet8uc& b) +{ return vclt_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) +{ return vcltq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vclt_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt(const Packet4us& a, const Packet4us& b) +{ return vclt_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) +{ return vcltq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vclt_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const 
Packet4i& b) +{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt(const Packet2ui& a, const Packet2ui& b) +{ return vclt_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) +{ return vcltq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vcltq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vcltq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vceq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vceq_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vceq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vceq_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + 
vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq(const Packet8uc& a, const Packet8uc& b) +{ return vceq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) +{ return vceqq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vceq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq(const Packet4us& a, const Packet4us& b) +{ return vceq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) +{ return vceqq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vceq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq(const Packet2ui& a, const Packet2ui& b) +{ return vceq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) +{ return vceqq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vceqq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vceqq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? 
numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } + +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet2f pand(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pand(const Packet4c& a, const Packet4c& b) +{ return a & b; } +template<> EIGEN_STRONG_INLINE Packet8c pand(const Packet8c& a, const Packet8c& b) +{ return vand_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pand(const Packet16c& a, const Packet16c& b) +{ return vandq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pand(const Packet4uc& a, const Packet4uc& b) +{ return a & b; } +template<> EIGEN_STRONG_INLINE Packet8uc pand(const Packet8uc& a, const Packet8uc& b) +{ return vand_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pand(const Packet16uc& a, const Packet16uc& b) +{ return vandq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pand(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pand(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pand(const Packet4us& a, const Packet4us& b) +{ return vand_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pand(const Packet8us& a, const Packet8us& b) +{ return vandq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pand(const Packet2i& a, const 
Packet2i& b) { return vand_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pand(const Packet2ui& a, const Packet2ui& b) +{ return vand_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pand(const Packet4ui& a, const Packet4ui& b) +{ return vandq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pand(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pand(const Packet2ul& a, const Packet2ul& b) +{ return vandq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f por(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c por(const Packet4c& a, const Packet4c& b) +{ return a | b; } +template<> EIGEN_STRONG_INLINE Packet8c por(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c por(const Packet16c& a, const Packet16c& b) +{ return vorrq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc por(const Packet4uc& a, const Packet4uc& b) +{ return a | b; } +template<> EIGEN_STRONG_INLINE Packet8uc por(const Packet8uc& a, const Packet8uc& b) +{ return vorr_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc por(const Packet16uc& a, const Packet16uc& b) +{ return vorrq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s por(const Packet4s& a, const Packet4s& b) +{ return vorr_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s por(const Packet8s& a, const Packet8s& b) +{ return vorrq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us por(const Packet4us& a, const Packet4us& b) +{ return vorr_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us 
por(const Packet8us& a, const Packet8us& b) +{ return vorrq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i por(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui por(const Packet2ui& a, const Packet2ui& b) +{ return vorr_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui por(const Packet4ui& a, const Packet4ui& b) +{ return vorrq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l por(const Packet2l& a, const Packet2l& b) +{ return vorrq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul por(const Packet2ul& a, const Packet2ul& b) +{ return vorrq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pxor(const Packet4c& a, const Packet4c& b) +{ return a ^ b; } +template<> EIGEN_STRONG_INLINE Packet8c pxor(const Packet8c& a, const Packet8c& b) +{ return veor_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pxor(const Packet16c& a, const Packet16c& b) +{ return veorq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pxor(const Packet4uc& a, const Packet4uc& b) +{ return a ^ b; } +template<> EIGEN_STRONG_INLINE Packet8uc pxor(const Packet8uc& a, const Packet8uc& b) +{ return veor_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pxor(const Packet16uc& a, const Packet16uc& b) +{ return veorq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pxor(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pxor(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); } +template<> 
EIGEN_STRONG_INLINE Packet4us pxor(const Packet4us& a, const Packet4us& b) +{ return veor_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pxor(const Packet8us& a, const Packet8us& b) +{ return veorq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pxor(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pxor(const Packet2ui& a, const Packet2ui& b) +{ return veor_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pxor(const Packet4ui& a, const Packet4ui& b) +{ return veorq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pxor(const Packet2l& a, const Packet2l& b) +{ return veorq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pxor(const Packet2ul& a, const Packet2ul& b) +{ return veorq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pandnot(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pandnot(const Packet4c& a, const Packet4c& b) +{ return a & ~b; } +template<> EIGEN_STRONG_INLINE Packet8c pandnot(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pandnot(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pandnot(const Packet4uc& a, const Packet4uc& b) +{ return a & ~b; } +template<> EIGEN_STRONG_INLINE Packet8uc pandnot(const Packet8uc& a, const Packet8uc& b) +{ return vbic_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pandnot(const Packet16uc& a, const Packet16uc& b) +{ return vbicq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pandnot(const 
Packet4s& a, const Packet4s& b) +{ return vbic_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pandnot(const Packet8s& a, const Packet8s& b) +{ return vbicq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pandnot(const Packet4us& a, const Packet4us& b) +{ return vbic_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pandnot(const Packet8us& a, const Packet8us& b) +{ return vbicq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pandnot(const Packet2i& a, const Packet2i& b) +{ return vbic_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) +{ return vbicq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pandnot(const Packet2ui& a, const Packet2ui& b) +{ return vbic_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pandnot(const Packet4ui& a, const Packet4ui& b) +{ return vbicq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pandnot(const Packet2l& a, const Packet2l& b) +{ return vbicq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pandnot(const Packet2ul& a, const Packet2ul& b) +{ return vbicq_u64(a,b); } + + +template EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); } +template 
EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); } +template EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); } +template EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); } + +template EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) +{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); } +template EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) +{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); } +template EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) +{ /* Logical shift of four packed unsigned bytes: shift as u8 lanes so zeros, not copies of the top bit, are shifted in. */ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) +{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); } +template EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) +{
return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); } +template EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) +{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) +{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); } +template EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) +{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); } +template EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); } + +template EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); } +template EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); } +template 
EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); } +template EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); } + +template<> EIGEN_STRONG_INLINE Packet2f pload(const float* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c pload(const int8_t* from) +{ + Packet4c res; + memcpy(&res, from, sizeof(Packet4c)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8c pload(const int8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc pload(const uint8_t* from) +{ + Packet4uc res; + memcpy(&res, from, sizeof(Packet4uc)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8uc pload(const uint8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s pload(const int16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s pload(const int16_t* 
from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us pload(const uint16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us pload(const uint16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i pload(const int32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui pload(const uint32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui pload(const uint32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul pload(const uint64_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet2f ploadu(const float* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c ploadu(const int8_t* from) +{ + Packet4c res; + memcpy(&res, from, sizeof(Packet4c)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8c ploadu(const int8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c ploadu(const int8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc ploadu(const uint8_t* from) +{ + Packet4uc res; + memcpy(&res, from, sizeof(Packet4uc)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8uc ploadu(const uint8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc 
ploadu(const uint8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s ploadu(const int16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s ploadu(const int16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us ploadu(const uint16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us ploadu(const uint16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i ploadu(const int32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui ploadu(const uint32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploadu(const uint32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l ploadu(const int64_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul ploadu(const uint64_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet2f ploaddup(const float* from) +{ return vld1_dup_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); } +template<> EIGEN_STRONG_INLINE Packet4c ploaddup(const int8_t* from) +{ + const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload(from))); + return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c ploaddup(const int8_t* from) +{ + const int8x8_t a = vld1_s8(from); + return vzip_s8(a,a).val[0]; +} +template<> EIGEN_STRONG_INLINE Packet16c ploaddup(const int8_t* 
from) +{ + const int8x8_t a = vld1_s8(from); + const int8x8x2_t b = vzip_s8(a,a); + return vcombine_s8(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload(from))); + return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vld1_u8(from); + return vzip_u8(a,a).val[0]; +} +template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vld1_u8(from); + const uint8x8x2_t b = vzip_u8(a,a); + return vcombine_u8(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4s ploaddup(const int16_t* from) +{ + return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), + vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const int16_t* from) +{ + const int16x4_t a = vld1_s16(from); + const int16x4x2_t b = vzip_s16(a,a); + return vcombine_s16(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4us ploaddup(const uint16_t* from) +{ + return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), + vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const uint16_t* from) +{ + const uint16x4_t a = vld1_u16(from); + const uint16x4x2_t b = vzip_u16(a,a); + return vcombine_u16(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet2i ploaddup(const int32_t* from) +{ return vld1_dup_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) +{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); } +template<> EIGEN_STRONG_INLINE Packet2ui ploaddup(const uint32_t* from) +{ return vld1_dup_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploaddup(const uint32_t* from) +{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); } 
+template<> EIGEN_STRONG_INLINE Packet2l ploaddup(const int64_t* from) +{ return vld1q_dup_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul ploaddup(const uint64_t* from) +{ return vld1q_dup_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet4f ploadquad(const float* from) { return vld1q_dup_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c ploadquad(const int8_t* from) +{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8c ploadquad(const int8_t* from) +{ + return vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from)), + vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet16c ploadquad(const int8_t* from) +{ + const int8x8_t a = vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from)), + vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]); + const int8x8_t b = vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from+2)), + vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]); + return vcombine_s8(a,b); +} +template<> EIGEN_STRONG_INLINE Packet4uc ploadquad(const uint8_t* from) +{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc ploadquad(const uint8_t* from) +{ + return vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from)), + vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet16uc ploadquad(const uint8_t* from) +{ + const uint8x8_t a = vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from)), + vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]); + const uint8x8_t b = vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from+2)), + vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]); + return vcombine_u8(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8s ploadquad(const int16_t* from) +{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); } +template<> EIGEN_STRONG_INLINE 
Packet8us ploadquad(const uint16_t* from) +{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); } +template<> EIGEN_STRONG_INLINE Packet4i ploadquad(const int32_t* from) { return vld1q_dup_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploadquad(const uint32_t* from) { return vld1q_dup_u32(from); } + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet2f& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet4c& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet8c& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet4uc& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet8uc& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet4s& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet8s& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet4us& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet8us& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet2i& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); } +template<> 
EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet2ui& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet4ui& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint64_t* to, const Packet2ul& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet2f& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet4c& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet8c& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet16c& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet4uc& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet8uc& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet16uc& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet4s& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet8s& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const 
Packet4us& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const Packet8us& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet2i& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet2ui& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet4ui& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int64_t* to, const Packet2l& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint64_t* to, const Packet2ul& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather(const float* from, Index stride) +{ + Packet2f res = vld1_dup_f32(from); + res = vld1_lane_f32(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather(const float* from, Index stride) +{ + Packet4f res = vld1q_dup_f32(from); + res = vld1q_lane_f32(from + 1*stride, res, 1); + res = vld1q_lane_f32(from + 2*stride, res, 2); + res = vld1q_lane_f32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather(const int8_t* from, Index stride) +{ + Packet4c res; + for (int i = 0; i != 4; i++) + reinterpret_cast(&res)[i] = *(from + i * stride); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather(const int8_t* from, Index stride) +{ + Packet8c res = vld1_dup_s8(from); + res = vld1_lane_s8(from + 1*stride, res, 1); + res = vld1_lane_s8(from + 2*stride, res, 2); + res = vld1_lane_s8(from + 
3*stride, res, 3); + res = vld1_lane_s8(from + 4*stride, res, 4); + res = vld1_lane_s8(from + 5*stride, res, 5); + res = vld1_lane_s8(from + 6*stride, res, 6); + res = vld1_lane_s8(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather(const int8_t* from, Index stride) +{ + Packet16c res = vld1q_dup_s8(from); + res = vld1q_lane_s8(from + 1*stride, res, 1); + res = vld1q_lane_s8(from + 2*stride, res, 2); + res = vld1q_lane_s8(from + 3*stride, res, 3); + res = vld1q_lane_s8(from + 4*stride, res, 4); + res = vld1q_lane_s8(from + 5*stride, res, 5); + res = vld1q_lane_s8(from + 6*stride, res, 6); + res = vld1q_lane_s8(from + 7*stride, res, 7); + res = vld1q_lane_s8(from + 8*stride, res, 8); + res = vld1q_lane_s8(from + 9*stride, res, 9); + res = vld1q_lane_s8(from + 10*stride, res, 10); + res = vld1q_lane_s8(from + 11*stride, res, 11); + res = vld1q_lane_s8(from + 12*stride, res, 12); + res = vld1q_lane_s8(from + 13*stride, res, 13); + res = vld1q_lane_s8(from + 14*stride, res, 14); + res = vld1q_lane_s8(from + 15*stride, res, 15); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather(const uint8_t* from, Index stride) +{ + Packet4uc res; + for (int i = 0; i != 4; i++) + reinterpret_cast(&res)[i] = *(from + i * stride); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather(const uint8_t* from, Index stride) +{ + Packet8uc res = vld1_dup_u8(from); + res = vld1_lane_u8(from + 1*stride, res, 1); + res = vld1_lane_u8(from + 2*stride, res, 2); + res = vld1_lane_u8(from + 3*stride, res, 3); + res = vld1_lane_u8(from + 4*stride, res, 4); + res = vld1_lane_u8(from + 5*stride, res, 5); + res = vld1_lane_u8(from + 6*stride, res, 6); + res = vld1_lane_u8(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather(const uint8_t* from, Index stride) +{ + Packet16uc res = vld1q_dup_u8(from); + res = 
vld1q_lane_u8(from + 1*stride, res, 1); + res = vld1q_lane_u8(from + 2*stride, res, 2); + res = vld1q_lane_u8(from + 3*stride, res, 3); + res = vld1q_lane_u8(from + 4*stride, res, 4); + res = vld1q_lane_u8(from + 5*stride, res, 5); + res = vld1q_lane_u8(from + 6*stride, res, 6); + res = vld1q_lane_u8(from + 7*stride, res, 7); + res = vld1q_lane_u8(from + 8*stride, res, 8); + res = vld1q_lane_u8(from + 9*stride, res, 9); + res = vld1q_lane_u8(from + 10*stride, res, 10); + res = vld1q_lane_u8(from + 11*stride, res, 11); + res = vld1q_lane_u8(from + 12*stride, res, 12); + res = vld1q_lane_u8(from + 13*stride, res, 13); + res = vld1q_lane_u8(from + 14*stride, res, 14); + res = vld1q_lane_u8(from + 15*stride, res, 15); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather(const int16_t* from, Index stride) +{ + Packet4s res = vld1_dup_s16(from); + res = vld1_lane_s16(from + 1*stride, res, 1); + res = vld1_lane_s16(from + 2*stride, res, 2); + res = vld1_lane_s16(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather(const int16_t* from, Index stride) +{ + Packet8s res = vld1q_dup_s16(from); + res = vld1q_lane_s16(from + 1*stride, res, 1); + res = vld1q_lane_s16(from + 2*stride, res, 2); + res = vld1q_lane_s16(from + 3*stride, res, 3); + res = vld1q_lane_s16(from + 4*stride, res, 4); + res = vld1q_lane_s16(from + 5*stride, res, 5); + res = vld1q_lane_s16(from + 6*stride, res, 6); + res = vld1q_lane_s16(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather(const uint16_t* from, Index stride) +{ + Packet4us res = vld1_dup_u16(from); + res = vld1_lane_u16(from + 1*stride, res, 1); + res = vld1_lane_u16(from + 2*stride, res, 2); + res = vld1_lane_u16(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather(const uint16_t* from, Index stride) +{ + Packet8us res = vld1q_dup_u16(from); 
+ res = vld1q_lane_u16(from + 1*stride, res, 1); + res = vld1q_lane_u16(from + 2*stride, res, 2); + res = vld1q_lane_u16(from + 3*stride, res, 3); + res = vld1q_lane_u16(from + 4*stride, res, 4); + res = vld1q_lane_u16(from + 5*stride, res, 5); + res = vld1q_lane_u16(from + 6*stride, res, 6); + res = vld1q_lane_u16(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather(const int32_t* from, Index stride) +{ + Packet2i res = vld1_dup_s32(from); + res = vld1_lane_s32(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather(const int32_t* from, Index stride) +{ + Packet4i res = vld1q_dup_s32(from); + res = vld1q_lane_s32(from + 1*stride, res, 1); + res = vld1q_lane_s32(from + 2*stride, res, 2); + res = vld1q_lane_s32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather(const uint32_t* from, Index stride) +{ + Packet2ui res = vld1_dup_u32(from); + res = vld1_lane_u32(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather(const uint32_t* from, Index stride) +{ + Packet4ui res = vld1q_dup_u32(from); + res = vld1q_lane_u32(from + 1*stride, res, 1); + res = vld1q_lane_u32(from + 2*stride, res, 2); + res = vld1q_lane_u32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather(const int64_t* from, Index stride) +{ + Packet2l res = vld1q_dup_s64(from); + res = vld1q_lane_s64(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather(const uint64_t* from, Index stride) +{ + Packet2ul res = vld1q_dup_u64(from); + res = vld1q_lane_u64(from + 1*stride, res, 1); + return res; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(float* to, const Packet2f& from, Index stride) +{ + vst1_lane_f32(to + stride*0, from, 0); + vst1_lane_f32(to + 
stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(float* to, const Packet4f& from, Index stride) +{ + vst1q_lane_f32(to + stride*0, from, 0); + vst1q_lane_f32(to + stride*1, from, 1); + vst1q_lane_f32(to + stride*2, from, 2); + vst1q_lane_f32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet4c& from, Index stride) +{ + for (int i = 0; i != 4; i++) + *(to + i * stride) = reinterpret_cast(&from)[i]; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet8c& from, Index stride) +{ + vst1_lane_s8(to + stride*0, from, 0); + vst1_lane_s8(to + stride*1, from, 1); + vst1_lane_s8(to + stride*2, from, 2); + vst1_lane_s8(to + stride*3, from, 3); + vst1_lane_s8(to + stride*4, from, 4); + vst1_lane_s8(to + stride*5, from, 5); + vst1_lane_s8(to + stride*6, from, 6); + vst1_lane_s8(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet16c& from, Index stride) +{ + vst1q_lane_s8(to + stride*0, from, 0); + vst1q_lane_s8(to + stride*1, from, 1); + vst1q_lane_s8(to + stride*2, from, 2); + vst1q_lane_s8(to + stride*3, from, 3); + vst1q_lane_s8(to + stride*4, from, 4); + vst1q_lane_s8(to + stride*5, from, 5); + vst1q_lane_s8(to + stride*6, from, 6); + vst1q_lane_s8(to + stride*7, from, 7); + vst1q_lane_s8(to + stride*8, from, 8); + vst1q_lane_s8(to + stride*9, from, 9); + vst1q_lane_s8(to + stride*10, from, 10); + vst1q_lane_s8(to + stride*11, from, 11); + vst1q_lane_s8(to + stride*12, from, 12); + vst1q_lane_s8(to + stride*13, from, 13); + vst1q_lane_s8(to + stride*14, from, 14); + vst1q_lane_s8(to + stride*15, from, 15); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint8_t* to, const Packet4uc& from, Index stride) +{ + for (int i = 0; i != 4; i++) + *(to + i * stride) = reinterpret_cast(&from)[i]; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
pscatter(uint8_t* to, const Packet8uc& from, Index stride) +{ + vst1_lane_u8(to + stride*0, from, 0); + vst1_lane_u8(to + stride*1, from, 1); + vst1_lane_u8(to + stride*2, from, 2); + vst1_lane_u8(to + stride*3, from, 3); + vst1_lane_u8(to + stride*4, from, 4); + vst1_lane_u8(to + stride*5, from, 5); + vst1_lane_u8(to + stride*6, from, 6); + vst1_lane_u8(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint8_t* to, const Packet16uc& from, Index stride) +{ + vst1q_lane_u8(to + stride*0, from, 0); + vst1q_lane_u8(to + stride*1, from, 1); + vst1q_lane_u8(to + stride*2, from, 2); + vst1q_lane_u8(to + stride*3, from, 3); + vst1q_lane_u8(to + stride*4, from, 4); + vst1q_lane_u8(to + stride*5, from, 5); + vst1q_lane_u8(to + stride*6, from, 6); + vst1q_lane_u8(to + stride*7, from, 7); + vst1q_lane_u8(to + stride*8, from, 8); + vst1q_lane_u8(to + stride*9, from, 9); + vst1q_lane_u8(to + stride*10, from, 10); + vst1q_lane_u8(to + stride*11, from, 11); + vst1q_lane_u8(to + stride*12, from, 12); + vst1q_lane_u8(to + stride*13, from, 13); + vst1q_lane_u8(to + stride*14, from, 14); + vst1q_lane_u8(to + stride*15, from, 15); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int16_t* to, const Packet4s& from, Index stride) +{ + vst1_lane_s16(to + stride*0, from, 0); + vst1_lane_s16(to + stride*1, from, 1); + vst1_lane_s16(to + stride*2, from, 2); + vst1_lane_s16(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int16_t* to, const Packet8s& from, Index stride) +{ + vst1q_lane_s16(to + stride*0, from, 0); + vst1q_lane_s16(to + stride*1, from, 1); + vst1q_lane_s16(to + stride*2, from, 2); + vst1q_lane_s16(to + stride*3, from, 3); + vst1q_lane_s16(to + stride*4, from, 4); + vst1q_lane_s16(to + stride*5, from, 5); + vst1q_lane_s16(to + stride*6, from, 6); + vst1q_lane_s16(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint16_t* to, const 
Packet4us& from, Index stride) +{ + vst1_lane_u16(to + stride*0, from, 0); + vst1_lane_u16(to + stride*1, from, 1); + vst1_lane_u16(to + stride*2, from, 2); + vst1_lane_u16(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint16_t* to, const Packet8us& from, Index stride) +{ + vst1q_lane_u16(to + stride*0, from, 0); + vst1q_lane_u16(to + stride*1, from, 1); + vst1q_lane_u16(to + stride*2, from, 2); + vst1q_lane_u16(to + stride*3, from, 3); + vst1q_lane_u16(to + stride*4, from, 4); + vst1q_lane_u16(to + stride*5, from, 5); + vst1q_lane_u16(to + stride*6, from, 6); + vst1q_lane_u16(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int32_t* to, const Packet2i& from, Index stride) +{ + vst1_lane_s32(to + stride*0, from, 0); + vst1_lane_s32(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int32_t* to, const Packet4i& from, Index stride) +{ + vst1q_lane_s32(to + stride*0, from, 0); + vst1q_lane_s32(to + stride*1, from, 1); + vst1q_lane_s32(to + stride*2, from, 2); + vst1q_lane_s32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint32_t* to, const Packet2ui& from, Index stride) +{ + vst1_lane_u32(to + stride*0, from, 0); + vst1_lane_u32(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint32_t* to, const Packet4ui& from, Index stride) +{ + vst1q_lane_u32(to + stride*0, from, 0); + vst1q_lane_u32(to + stride*1, from, 1); + vst1q_lane_u32(to + stride*2, from, 2); + vst1q_lane_u32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int64_t* to, const Packet2l& from, Index stride) +{ + vst1q_lane_s64(to + stride*0, from, 0); + vst1q_lane_s64(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint64_t* to, const Packet2ul& from, Index stride) +{ + vst1q_lane_u64(to + stride*0, from, 
0); + vst1q_lane_u64(to + stride*1, from, 1); +} + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE float pfirst(const Packet2f& a) { return vget_lane_f32(a,0); } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return vgetq_lane_f32(a,0); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet4c& a) { return static_cast(a & 0xff); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet8c& a) { return vget_lane_s8(a,0); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet16c& a) { return vgetq_lane_s8(a,0); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet4uc& a) { return static_cast(a & 0xff); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet8uc& a) { return vget_lane_u8(a,0); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet16uc& a) { return vgetq_lane_u8(a,0); } +template<> EIGEN_STRONG_INLINE int16_t pfirst(const Packet4s& a) { return vget_lane_s16(a,0); } +template<> EIGEN_STRONG_INLINE int16_t pfirst(const Packet8s& a) { return vgetq_lane_s16(a,0); } +template<> EIGEN_STRONG_INLINE uint16_t pfirst(const Packet4us& a) { return vget_lane_u16(a,0); } 
+template<> EIGEN_STRONG_INLINE uint16_t pfirst(const Packet8us& a) { return vgetq_lane_u16(a,0); } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet2i& a) { return vget_lane_s32(a,0); } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { return vgetq_lane_s32(a,0); } +template<> EIGEN_STRONG_INLINE uint32_t pfirst(const Packet2ui& a) { return vget_lane_u32(a,0); } +template<> EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { return vgetq_lane_u32(a,0); } +template<> EIGEN_STRONG_INLINE int64_t pfirst(const Packet2l& a) { return vgetq_lane_s64(a,0); } +template<> EIGEN_STRONG_INLINE uint64_t pfirst(const Packet2ul& a) { return vgetq_lane_u64(a,0); } + +template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + const float32x4_t a_r64 = vrev64q_f32(a); + return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) +{ + const int8x16_t a_r64 = vrev64q_s8(a); + return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); } +template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) +{ + const uint8x16_t a_r64 = vrev64q_u8(a); + return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); } +template<> EIGEN_STRONG_INLINE 
Packet8s preverse(const Packet8s& a) +{ + const int16x8_t a_r64 = vrev64q_s16(a); + return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); } +template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) +{ + const uint16x8_t a_r64 = vrev64q_u16(a); + return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + const int32x4_t a_r64 = vrev64q_s32(a); + return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); } +template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) +{ + const uint32x4_t a_r64 = vrev64q_u32(a); + return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) +{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); } +template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) +{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); } + +template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4c pabs(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); } +template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16uc pabs(const 
Packet16uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); } +template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) { +#if EIGEN_ARCH_ARM64 + return vabsq_s64(a); +#else + return vcombine_s64( + vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), + vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1)))); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet2f pfrexp(const Packet2f& a, Packet2f& exponent) +{ return pfrexp_generic(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) +{ return pfrexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE Packet2f pldexp(const Packet2f& a, const Packet2f& exponent) +{ return pldexp_generic(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) +{ return pldexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE float predux(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); } +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpadd_f32(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet4c& a) +{ + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + 
int8x8_t sum = vpadd_s8(a_dup, a_dup); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet8c& a) +{ + int8x8_t sum = vpadd_s8(a,a); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) +{ + int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a)); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t sum = vpadd_u8(a_dup, a_dup); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet8uc& a) +{ + uint8x8_t sum = vpadd_u8(a,a); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) +{ + uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a)); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux(const Packet4s& a) +{ + const int16x4_t sum = vpadd_s16(a,a); + return vget_lane_s16(vpadd_s16(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux(const Packet8s& a) +{ + int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a)); + sum = vpadd_s16(sum, sum); + sum = vpadd_s16(sum, sum); + return vget_lane_s16(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet4us& a) +{ + const uint16x4_t sum = vpadd_u16(a,a); + return vget_lane_u16(vpadd_u16(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet8us& a) +{ + uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a)); + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + return vget_lane_u16(sum, 0); +} 
+template<> EIGEN_STRONG_INLINE int32_t predux(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); } +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) +{ + const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpadd_s32(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) +{ + const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpadd_u32(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) +{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); } +template<> EIGEN_STRONG_INLINE uint64_t predux(const Packet2ul& a) +{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) +{ + return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, + vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) +{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) +{ + return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, + vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) +{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) +{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) +{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); } + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float 
predux_mul(const Packet2f& a) +{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); } +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet4c& a) +{ + int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a)); + prod = vmul_s8(prod, vrev16_s8(prod)); + return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2); +} +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet8c& a) +{ + int8x8_t prod = vmul_s8(a, vrev16_s8(a)); + prod = vmul_s8(prod, vrev32_s8(prod)); + return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4); +} +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) +{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet4uc& a) +{ + uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a)); + prod = vmul_u8(prod, vrev16_u8(prod)); + return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet8uc& a) +{ + uint8x8_t prod = vmul_u8(a, vrev16_u8(a)); + prod = vmul_u8(prod, vrev32_u8(prod)); + return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) +{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } +template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet4s& a) +{ + const int16x4_t prod = vmul_s16(a, vrev32_s16(a)); + return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2); +} +template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet8s& a) +{ + int16x4_t prod; + + // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8| + prod = vmul_s16(vget_low_s16(a), vget_high_s16(a)); + // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8| + prod = vmul_s16(prod, vrev32_s16(prod)); + // Multiply |a1*a5*a2*a6*a3*a7*a4*a8| + return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2); +} +template<> 
EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet4us& a) +{ + const uint16x4_t prod = vmul_u16(a, vrev32_u16(a)); + return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet8us& a) +{ + uint16x4_t prod; + + // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8| + prod = vmul_u16(vget_low_u16(a), vget_high_u16(a)); + // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8| + prod = vmul_u16(prod, vrev32_u16(prod)); + // Multiply |a1*a5*a2*a6*a3*a7*a4*a8| + return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2); +} +template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet2i& a) +{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); } +template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) +{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } +template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet2ui& a) +{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); } +template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) +{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } +template<> EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) +{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); } +template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) +{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); } + +// min +template<> EIGEN_STRONG_INLINE float predux_min(const Packet2f& a) +{ return vget_lane_f32(vpmin_f32(a,a), 0); } +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpmin_f32(min, min), 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet4c& a) +{ + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + int8x8_t min = vpmin_s8(a_dup, a_dup); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet8c& a) +{ 
+ int8x8_t min = vpmin_s8(a,a); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet16c& a) +{ + int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a)); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t min = vpmin_u8(a_dup, a_dup); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet8uc& a) +{ + uint8x8_t min = vpmin_u8(a,a); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet16uc& a) +{ + uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a)); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet4s& a) +{ + const int16x4_t min = vpmin_s16(a,a); + return vget_lane_s16(vpmin_s16(min, min), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet8s& a) +{ + int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a)); + min = vpmin_s16(min, min); + min = vpmin_s16(min, min); + return vget_lane_s16(min, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet4us& a) +{ + const uint16x4_t min = vpmin_u16(a,a); + return vget_lane_u16(vpmin_u16(min, min), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet8us& a) +{ + uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a)); + min = vpmin_u16(min, min); + min = vpmin_u16(min, min); + return vget_lane_u16(min, 0); +} +template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet2i& a) +{ return vget_lane_s32(vpmin_s32(a,a), 0); } +template<> 
EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) +{ + const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpmin_s32(min, min), 0); +} +template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet2ui& a) +{ return vget_lane_u32(vpmin_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) +{ + const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpmin_u32(min, min), 0); +} +template<> EIGEN_STRONG_INLINE int64_t predux_min(const Packet2l& a) +{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); } +template<> EIGEN_STRONG_INLINE uint64_t predux_min(const Packet2ul& a) +{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } + +// max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet2f& a) +{ return vget_lane_f32(vpmax_f32(a,a), 0); } +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpmax_f32(max, max), 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet4c& a) +{ + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + int8x8_t max = vpmax_s8(a_dup, a_dup); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet8c& a) +{ + int8x8_t max = vpmax_s8(a,a); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet16c& a) +{ + int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a)); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t max = vpmax_u8(a_dup, a_dup); + max = vpmax_u8(max, max); + return 
vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet8uc& a) +{ + uint8x8_t max = vpmax_u8(a,a); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + return vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet16uc& a) +{ + uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a)); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + return vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet4s& a) +{ + const int16x4_t max = vpmax_s16(a,a); + return vget_lane_s16(vpmax_s16(max, max), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet8s& a) +{ + int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a)); + max = vpmax_s16(max, max); + max = vpmax_s16(max, max); + return vget_lane_s16(max, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet4us& a) +{ + const uint16x4_t max = vpmax_u16(a,a); + return vget_lane_u16(vpmax_u16(max, max), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet8us& a) +{ + uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a)); + max = vpmax_u16(max, max); + max = vpmax_u16(max, max); + return vget_lane_u16(max, 0); +} +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet2i& a) +{ return vget_lane_s32(vpmax_s32(a,a), 0); } +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) +{ + const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpmax_s32(max, max), 0); +} +template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet2ui& a) +{ return vget_lane_u32(vpmax_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) +{ + const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpmax_u32(max, max), 0); +} +template<> EIGEN_STRONG_INLINE int64_t predux_max(const Packet2l& a) +{ return (std::max)(vgetq_lane_s64(a, 
0), vgetq_lane_s64(a, 1)); } +template<> EIGEN_STRONG_INLINE uint64_t predux_max(const Packet2ul& a) +{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)), + vget_high_u32(vreinterpretq_u32_f32(x))); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0); +} + +// Helpers for ptranspose. +namespace detail { + +template +void zip_in_place(Packet& p1, Packet& p2); + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2f& p1, Packet2f& p2) { + const float32x2x2_t tmp = vzip_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4f& p1, Packet4f& p2) { + const float32x4x2_t tmp = vzipq_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8c& p1, Packet8c& p2) { + const int8x8x2_t tmp = vzip_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16c& p1, Packet16c& p2) { + const int8x16x2_t tmp = vzipq_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8uc& p1, Packet8uc& p2) { + const uint8x8x2_t tmp = vzip_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16uc& p1, Packet16uc& p2) { + const uint8x16x2_t tmp = vzipq_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2i& p1, Packet2i& p2) { + const int32x2x2_t tmp = vzip_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4i& p1, Packet4i& p2) { + const int32x4x2_t tmp = vzipq_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2ui& p1, Packet2ui& p2) { + const uint32x2x2_t tmp = vzip_u32(p1, p2); + 
p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4ui& p1, Packet4ui& p2) { + const uint32x4x2_t tmp = vzipq_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4s& p1, Packet4s& p2) { + const int16x4x2_t tmp = vzip_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8s& p1, Packet8s& p2) { + const int16x8x2_t tmp = vzipq_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4us& p1, Packet4us& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8us& p1, Packet8us& p2) { + const uint16x8x2_t tmp = vzipq_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[1]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[4]); + zip_in_place(kernel.packet[1], kernel.packet[5]); + zip_in_place(kernel.packet[2], kernel.packet[6]); + zip_in_place(kernel.packet[3], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[6]); + zip_in_place(kernel.packet[5], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[5]); + zip_in_place(kernel.packet[6], 
kernel.packet[7]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + EIGEN_UNROLL_LOOP + for (int i=0; i<4; ++i) { + const int m = (1 << i); + EIGEN_UNROLL_LOOP + for (int j=0; j& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1)); + const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); + + const int8x8x2_t zip8 = vzip_s8(a,b); + const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); + + kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); + const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 
1)); + + const uint8x8x2_t zip8 = vzip_u8(a,b); + const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); + + kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::zip_in_place(kernel.packet[0], kernel.packet[1]); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ +#if EIGEN_ARCH_ARM64 + const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = tmp1; +#else + const int64x1_t tmp[2][2] = { + { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) }, + { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) } + }; + + kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]); + kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]); +#endif +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ +#if EIGEN_ARCH_ARM64 + const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = tmp1; +#else + const uint64x1_t tmp[2][2] = { + { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) }, + { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) } + }; + + kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]); + kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]); +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b) +{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) +{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) +{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); } +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) +{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) +{ return vbsl_u8(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b) +{ return vbslq_u8(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) +{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) +{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) +{ return vbsl_u16(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) +{ return vbslq_u16(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) +{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) +{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) +{ return vbsl_u32(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) +{ return vbslq_u32(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, 
const Packet2l& b) +{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) +{ return vbslq_u64(mask, a, b); } + +// Use armv8 rounding intinsics if available. +#if EIGEN_ARCH_ARMV8 +template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) +{ return vrndn_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) +{ return vrndnq_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) +{ return vrndm_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ return vrndmq_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +{ return vrndp_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ return vrndpq_f32(a); } + +#else + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet2f limit = pset1(static_cast(1<<23)); + const Packet2f abs_a = pabs(a); + Packet2f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. 
+ r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If greater, subtract one. + Packet4f mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) +{ + const Packet2f cst_1 = pset1(1.0f); + Packet2f tmp = print(a); + // If greater, subtract one. + Packet2f mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If smaller, add one. + Packet4f mask = pcmp_lt(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +{ + const Packet2f cst_1 = pset1(1.0); + Packet2f tmp = print(a); + // If smaller, add one. + Packet2f mask = pcmp_lt(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +#endif + +/** + * Computes the integer square root + * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result + * and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument + * value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf . 
+ */ +template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) { + uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t res = vdup_n_u8(0); + uint8x8_t add = vdup_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x8_t temp = vorr_u8(res, add); + res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res); + add = vshr_n_u8(add, 1); + } + return vget_lane_u32(vreinterpret_u32_u8(res), 0); +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) { + uint8x8_t res = vdup_n_u8(0); + uint8x8_t add = vdup_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x8_t temp = vorr_u8(res, add); + res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res); + add = vshr_n_u8(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) { + uint8x16_t res = vdupq_n_u8(0); + uint8x16_t add = vdupq_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x16_t temp = vorrq_u8(res, add); + res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res); + add = vshrq_n_u8(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) { + uint16x4_t res = vdup_n_u16(0); + uint16x4_t add = vdup_n_u16(0x80); + for (int i = 0; i < 8; i++) + { + const uint16x4_t temp = vorr_u16(res, add); + res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res); + add = vshr_n_u16(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) { + uint16x8_t res = vdupq_n_u16(0); + uint16x8_t add = vdupq_n_u16(0x80); + for (int i = 0; i < 8; i++) + { + const uint16x8_t temp = vorrq_u16(res, add); + res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res); + add = vshrq_n_u16(add, 1); + } + return res; +} +/// @copydoc 
Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) { + uint32x2_t res = vdup_n_u32(0); + uint32x2_t add = vdup_n_u32(0x8000); + for (int i = 0; i < 16; i++) + { + const uint32x2_t temp = vorr_u32(res, add); + res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res); + add = vshr_n_u32(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { + uint32x4_t res = vdupq_n_u32(0); + uint32x4_t add = vdupq_n_u32(0x8000); + for (int i = 0; i < 16; i++) + { + const uint32x4_t temp = vorrq_u32(res, add); + res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res); + add = vshrq_n_u32(add, 1); + } + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + // Compute approximate reciprocal sqrt. + Packet4f x = vrsqrteq_f32(a); + // Do Newton iterations for 1/sqrt(x). + x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); + x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); + const Packet4f infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) { + // Compute approximate reciprocal sqrt. + Packet2f x = vrsqrte_f32(a); + // Do Newton iterations for 1/sqrt(x). + x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); + x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); + const Packet2f infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +// Unfortunately vsqrt_f32 is only available for A64. 
+#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);} +template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); } +#else +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + const Packet4f infinity = pset1(NumTraits::infinity()); + const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); + return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); +} +template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { + const Packet2f infinity = pset1(NumTraits::infinity()); + const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); + return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); +} +#endif + +//---------- bfloat16 ---------- +// TODO: Add support for native armv8.6-a bfloat16_t + +// TODO: Guard if we have native bfloat16 support +typedef eigen_packet_wrapper Packet4bf; + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4bf type; + typedef Packet4bf half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. 
+ HasNdtri = 0 + }; +}; + +template<> struct unpacket_traits +{ + typedef bfloat16 type; + typedef Packet4bf half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +namespace detail { +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4bf& p1, Packet4bf& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} +} // namespace detail + +EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) +{ + // See the scalar implemention in BFloat16.h for a comprehensible explanation + // of this fast rounding algorithm + Packet4ui input = reinterpret_cast(p); + + // lsb = (input >> 16) & 1 + Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1)); + + // rounding_bias = 0x7fff + lsb + Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff)); + + // input += rounding_bias + input = vaddq_u32(input, rounding_bias); + + // input = input >> 16 + input = vshrq_n_u32(input, 16); + + // Replace float-nans by bfloat16-nans, that is 0x7fc0 + const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0); + const Packet4ui mask = vceqq_f32(p, p); + input = vbslq_u32(mask, input, bf16_nan); + + // output = static_cast(input) + return vmovn_u32(input); +} + +EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) +{ + return reinterpret_cast(vshlq_n_u32(vmovl_u16(p), 16)); +} + +EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { + return vmovn_u32(vreinterpretq_u32_f32(p)); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pset1(const bfloat16& from) { + return pset1(from.value); +} + +template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { + return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(from))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) +{ + return pload(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) +{ + 
return ploadu(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) +{ + EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet4bf& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet4bf ploaddup(const bfloat16* from) +{ + return ploaddup(reinterpret_cast(from)); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) { + return F32ToBf16(pabs(Bf16ToF32(a))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf plset(const bfloat16& a) +{ + return F32ToBf16(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) { + return por(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) { + return pxor(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) { + return pand(a, b); +} + +template<> EIGEN_STRONG_INLINE 
Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) { + return pandnot(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, + const Packet4bf& b) +{ + return pselect(mask, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) +{ + return F32ToBf16(print(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pfloor(const Packet4bf& a) +{ + return F32ToBf16(pfloor(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pceil(const Packet4bf& a) +{ + return F32ToBf16(pceil(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4bf padd(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf psub(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(psub(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pmul(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pdiv(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(pdiv(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> +EIGEN_STRONG_INLINE Packet4bf pgather(const bfloat16* from, Index stride) +{ + return pgather(reinterpret_cast(from), stride); +} + +template<> +EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet4bf& from, Index stride) +{ + pscatter(reinterpret_cast(to), from, stride); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet4bf& a) +{ + return static_cast(predux(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet4bf& a) +{ + return static_cast(predux_max(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet4bf& a) +{ + return static_cast(predux_min(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE 
bfloat16 predux_mul(const Packet4bf& a) +{ + return static_cast(predux_mul(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) +{ + return preverse(a); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + detail::ptranspose_impl(kernel); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff(const Packet4bf& a, const Packet4bf& b) +{ + return F32ToBf16(pabsdiff(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) +{ + return pxor(a, pset1(static_cast(0x8000))); +} + +//---------- double ---------- + +// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double. +// Confirmed at least with __apple_build_version__ = 6000054. +#ifdef __apple_build_version__ +// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed. +// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with +// major toolchain updates. 
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000) +#else +#define EIGEN_APPLE_DOUBLE_NEON_BUG 0 +#endif + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG + +// Bug 907: workaround missing declarations of the following two functions in the ADK +// Defining these functions as templates ensures that if these intrinsics are +// already defined in arm_neon.h, then our workaround doesn't cause a conflict +// and has lower priority in overload resolution. +template uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; } + +template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; } + +typedef float64x2_t Packet2d; +typedef float64x1_t Packet1d; + +// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask)) +// Currently used in LU/arch/InverseSize4.h to enable a shared implementation +// for fast inversion of matrices of size 4. +EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) +{ + const double* a = reinterpret_cast(&m); + const double* b = reinterpret_cast(&n); + Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))}; + return res; +} + +EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) +{ + return shuffle(a, b, mask); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b) +{ + return shuffle(a, b, 0); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b) +{ + return shuffle(a, b, 3); +} +#define vec2d_duplane(a, p) \ + vdupq_laneq_f64(a, p) + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 
1, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = 0, + HasErf = 0 + }; +}; + +template<> struct unpacket_traits +{ + typedef double type; + typedef Packet2d half; + typedef Packet2l integer_packet; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) +{ + const double c[] = {0.0,1.0}; + return vaddq_f64(pset1(a), vld1q_f64(c)); +} + +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b){ + const Packet2d mask = {numext::bit_cast(0x8000000000000000ull),0.0}; + return padd(a, pxor(mask, b)); +} + +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } + +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ return vfmaq_f64(c,a,b); } +#else +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ return vmlaq_f64(c,a,b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } + +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); } + +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return pmin(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } + + +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return pmax(a, b); } + +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } + +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } + +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } + +template<> 
EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { return vld1q_dup_f64(from); } +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather(const double* from, Index stride) +{ + Packet2d res = pset1(0.0); + res = vld1q_lane_f64(from + 0*stride, res, 0); + res = vld1q_lane_f64(from + 1*stride, res, 1); + return res; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(double* to, const Packet2d& from, Index stride) +{ + vst1q_lane_f64(to + stride*0, from, 0); + vst1q_lane_f64(to + stride*1, from, 1); +} + +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ARM_PREFETCH(addr); } + +// FIXME only store the 2 first elements ? 
+template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a,0); } + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } + +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } + +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif + +// Other reduction functions: +// mul +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif + +// min +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) +{ return vgetq_lane_f64(vpminq_f64(a,a), 0); } + +// max +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); } + + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ + const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); + const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); + + kernel.packet[0] = tmp1; + kernel.packet[1] = tmp2; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b) +{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) +{ return vrndnq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) +{ return vrndmq_f64(a); } + +template<> 
EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) +{ return vrndpq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) +{ return pldexp_generic(a, exponent); } + +template<> EIGEN_STRONG_INLINE Packet2d pfrexp(const Packet2d& a, Packet2d& exponent) +{ return pfrexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) +{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); } + +template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { + // Compute approximate reciprocal sqrt. + Packet2d x = vrsqrteq_f64(a); + // Do Newton iterations for 1/sqrt(x). + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + const Packet2d infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); } + +#endif // EIGEN_ARCH_ARM64 + +// Do we have an fp16 types and supporting Neon intrinsics? 
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC +typedef float16x4_t Packet4hf; +typedef float16x8_t Packet8hf; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8hf type; + typedef Packet4hf half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size = 4, - HasHalfPacket=0, // Packet2f intrinsics not implemented yet - - HasDiv = 1, - // FIXME check the Has* - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, - HasSqrt = 0 + size = 8, + HasHalfPacket = 1, + + HasCmp = 1, + HasCast = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasInsert = 1, + HasReduxp = 1, + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasRsqrt = 1, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. + HasNdtri = 0 }; }; -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4i type; - typedef Packet4i half; // Packet2i intrinsics not implemented yet + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet4hf half; enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket=0 // Packet2i intrinsics not implemented yet - // FIXME check the Has* + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false }; }; -#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM -// workaround gcc 4.2, 4.3 and 4.4 compilatin issue -EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE void vst1q_f32(float* 
to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } -EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } -#endif - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; - -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet4hf half; + enum { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) -{ - const float f[] = {0, 1, 2, 3}; - Packet4f countdown = vld1q_f32(f); - return vaddq_f32(pset1(a), countdown); +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4(const Packet8hf& a) { + return vadd_f16(vget_low_f16(a), vget_high_f16(a)); } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) -{ - const int32_t i[] = {0, 1, 2, 3}; - Packet4i countdown = vld1q_s32(i); - return vaddq_s32(pset1(a), countdown); + +template <> +EIGEN_STRONG_INLINE Packet8hf pset1(const Eigen::half& from) { + return vdupq_n_f16(from.x); } -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf pset1(const Eigen::half& from) { + return vdup_n_f16(from.x); +} -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& 
b) { return vsubq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet8hf plset(const Eigen::half& a) { + const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7}; + Packet8hf countdown = vld1q_f16(f); + return vaddq_f16(pset1(a), countdown); +} -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } +template <> +EIGEN_STRONG_INLINE Packet4hf plset(const Eigen::half& a) { + const float16_t f[] = {0, 1, 2, 3}; + Packet4hf countdown = vld1_f16(f); + return vadd_f16(pset1(a), countdown); +} -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template <> +EIGEN_STRONG_INLINE Packet8hf padd(const Packet8hf& a, const Packet8hf& b) { + return vaddq_f16(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf padd(const Packet4hf& a, const Packet4hf& b) { + return vadd_f16(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ -#if EIGEN_ARCH_ARM64 - return vdivq_f32(a,b); -#else - Packet4f inv, restep, div; +template <> +EIGEN_STRONG_INLINE Packet8hf psub(const Packet8hf& a, const Packet8hf& b) { + return vsubq_f16(a, b); +} - // NEON does not offer a divide instruction, we have to do a reciprocal approximation - // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers - // a reciprocal estimate AND a reciprocal step -which saves a few instructions - // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with - // Newton-Raphson and vrecpsq_f32() - inv = vrecpeq_f32(b); +template <> +EIGEN_STRONG_INLINE Packet4hf psub(const Packet4hf& a, const Packet4hf& b) { + 
return vsub_f16(a, b); +} - // This returns a differential, by which we will have to multiply inv to get a better - // approximation of 1/b. - restep = vrecpsq_f32(b, inv); - inv = vmulq_f32(restep, inv); +template <> +EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) { + return vnegq_f16(a); +} - // Finally, multiply a by 1/b and get the wanted result of the division. - div = vmulq_f32(a, inv); +template <> +EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) { + return vneg_f16(a); +} - return div; -#endif +template <> +EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) { + return a; } -template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); +template <> +EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) { + return a; } -// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, -// then implements a slow software scalar fallback calling fmaf()! -// Filed LLVM bug: -// https://llvm.org/bugs/show_bug.cgi?id=27216 -#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) -// See bug 936. -// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. -// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. -// MLA is not fused i.e. does 2 roundings. -// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): -// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } -#else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { -#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM - // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, - // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. 
Since the former is the default on - // -march=armv7-a, that is a very common case. - // See e.g. this thread: - // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html - // Filed LLVM bug: - // https://llvm.org/bugs/show_bug.cgi?id=27219 - Packet4f r = c; - asm volatile( - "vmla.f32 %q[r], %q[a], %q[b]" - : [r] "+w" (r) - : [a] "w" (a), - [b] "w" (b) - : ); - return r; -#else - return vmlaq_f32(c,a,b); -#endif +template <> +EIGEN_STRONG_INLINE Packet8hf pmul(const Packet8hf& a, const Packet8hf& b) { + return vmulq_f16(a, b); } -#endif -// No FMA instruction for int, so use MLA unconditionally. -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf pmul(const Packet4hf& a, const Packet4hf& b) { + return vmul_f16(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet8hf pdiv(const Packet8hf& a, const Packet8hf& b) { + return vdivq_f16(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf pdiv(const Packet4hf& a, const Packet4hf& b) { + return vdiv_f16(a, b); +} -// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +template <> +EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) { + return vfmaq_f16(c, a, b); } -template<> 
EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +template <> +EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) { + return vfma_f16(c, a, b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +template <> +EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { + return vminq_f16(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +template <> +EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { + return vmin_f16(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). 
+template<> EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); } +template<> EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); } +#endif -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { return pmin(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - float32x2_t lo, hi; - lo = vld1_dup_f32(from); - hi = vld1_dup_f32(from+1); - return vcombine_f32(lo, hi); +template<> EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { return pmin(a, b); } + +template <> +EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { + return vmaxq_f16(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) -{ - int32x2_t lo, hi; - lo = vld1_dup_s32(from); - hi = vld1_dup_s32(from+1); - return vcombine_s32(lo, hi); + +template <> +EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { + return vmax_f16(a, b); } -template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). 
+template<> EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); } +template<> EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); } +#endif -template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { return pmax(a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(from[0*stride], res, 0); - res = vsetq_lane_f32(from[1*stride], res, 1); - res = vsetq_lane_f32(from[2*stride], res, 2); - res = vsetq_lane_f32(from[3*stride], res, 3); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) -{ - Packet4i res = pset1(0); - res = vsetq_lane_s32(from[0*stride], res, 0); - res = vsetq_lane_s32(from[1*stride], res, 1); - res = vsetq_lane_s32(from[2*stride], res, 2); - res = vsetq_lane_s32(from[3*stride], res, 3); - return res; -} +template<> EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { return pmax(a, b); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - to[stride*0] = vgetq_lane_f32(from, 0); - to[stride*1] = vgetq_lane_f32(from, 1); - to[stride*2] = vgetq_lane_f32(from, 2); - to[stride*3] = vgetq_lane_f32(from, 3); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) -{ - to[stride*0] = vgetq_lane_s32(from, 0); - to[stride*1] = vgetq_lane_s32(from, 1); - to[stride*2] = vgetq_lane_s32(from, 2); - to[stride*3] = vgetq_lane_s32(from, 3); -} +#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \ + template <> \ + 
EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \ + return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \ + } -template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +#define EIGEN_MAKE_ARM_FP16_CMP_4(name) \ + template <> \ + EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \ + return vreinterpret_f16_u16(vc##name##_f16(a, b)); \ + } -// FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +EIGEN_MAKE_ARM_FP16_CMP_8(eq) +EIGEN_MAKE_ARM_FP16_CMP_8(lt) +EIGEN_MAKE_ARM_FP16_CMP_8(le) -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { - float32x2_t a_lo, a_hi; - Packet4f a_r64; +EIGEN_MAKE_ARM_FP16_CMP_4(eq) +EIGEN_MAKE_ARM_FP16_CMP_4(lt) +EIGEN_MAKE_ARM_FP16_CMP_4(le) - a_r64 = vrev64q_f32(a); - a_lo = vget_low_f32(a_r64); - a_hi = vget_high_f32(a_r64); - return vcombine_f32(a_hi, a_lo); +#undef EIGEN_MAKE_ARM_FP16_CMP_8 +#undef EIGEN_MAKE_ARM_FP16_CMP_4 + +template <> +EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b))); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - int32x2_t a_lo, a_hi; - Packet4i a_r64; - a_r64 = vrev64q_s32(a); - a_lo = vget_low_s32(a_r64); - a_hi = vget_high_s32(a_r64); - return vcombine_s32(a_hi, a_lo); +template <> +EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b))); } -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } -template<> 
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } +template <> +EIGEN_STRONG_INLINE Packet8hf print(const Packet8hf& a) +{ return vrndnq_f16(a); } -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, sum; +template <> +EIGEN_STRONG_INLINE Packet4hf print(const Packet4hf& a) +{ return vrndn_f16(a); } - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - sum = vpadd_f32(a_lo, a_hi); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); -} +template <> +EIGEN_STRONG_INLINE Packet8hf pfloor(const Packet8hf& a) +{ return vrndmq_f16(a); } -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - float32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4f sum1, sum2, sum; +template <> +EIGEN_STRONG_INLINE Packet4hf pfloor(const Packet4hf& a) +{ return vrndm_f16(a); } - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_f32(vecs[0], vecs[2]); - vtrn2 = vzipq_f32(vecs[1], vecs[3]); - res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); +template <> +EIGEN_STRONG_INLINE Packet8hf pceil(const Packet8hf& a) +{ return vrndpq_f16(a); } - // Do the addition of the resulting vectors - sum1 = vaddq_f32(res1.val[0], res1.val[1]); - sum2 = vaddq_f32(res2.val[0], res2.val[1]); - sum = vaddq_f32(sum1, sum2); +template <> +EIGEN_STRONG_INLINE Packet4hf pceil(const Packet4hf& a) +{ return vrndp_f16(a); } - return sum; +template <> +EIGEN_STRONG_INLINE Packet8hf psqrt(const Packet8hf& a) { + return vsqrtq_f16(a); } -template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, sum; +template <> +EIGEN_STRONG_INLINE Packet4hf psqrt(const Packet4hf& a) { + return vsqrt_f16(a); +} - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - sum = vpadd_s32(a_lo, a_hi); - sum = vpadd_s32(sum, sum); - return vget_lane_s32(sum, 0); +template <> 
+EIGEN_STRONG_INLINE Packet8hf pand(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); } -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - int32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4i sum1, sum2, sum; +template <> +EIGEN_STRONG_INLINE Packet4hf pand(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_s32(vecs[0], vecs[2]); - vtrn2 = vzipq_s32(vecs[1], vecs[3]); - res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); +template <> +EIGEN_STRONG_INLINE Packet8hf por(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); +} - // Do the addition of the resulting vectors - sum1 = vaddq_s32(res1.val[0], res1.val[1]); - sum2 = vaddq_s32(res2.val[0], res2.val[1]); - sum = vaddq_s32(sum1, sum2); +template <> +EIGEN_STRONG_INLINE Packet4hf por(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} - return sum; +template <> +EIGEN_STRONG_INLINE Packet8hf pxor(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); } -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, prod; +template <> +EIGEN_STRONG_INLINE Packet4hf pxor(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - // Get the product 
of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_f32(a_lo, a_hi); - // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_f32(prod, vrev64_f32(prod)); +template <> +EIGEN_STRONG_INLINE Packet8hf pandnot(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); +} - return vget_lane_f32(prod, 0); +template <> +EIGEN_STRONG_INLINE Packet4hf pandnot(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); } -template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, prod; - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_s32(a_lo, a_hi); - // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_s32(prod, vrev64_s32(prod)); +template <> +EIGEN_STRONG_INLINE Packet8hf pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); +} - return vget_lane_s32(prod, 0); +template <> +EIGEN_STRONG_INLINE Packet4hf pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); } -// min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, min; +template <> +EIGEN_STRONG_INLINE Packet8hf ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); +} - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - min = vpmin_f32(a_lo, a_hi); - min = vpmin_f32(min, min); +template <> +EIGEN_STRONG_INLINE Packet4hf ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); +} - return vget_lane_f32(min, 0); +template <> +EIGEN_STRONG_INLINE Packet8hf ploaddup(const Eigen::half* from) { + Packet8hf packet; + packet[0] = from[0].x; + packet[1] = from[0].x; + 
packet[2] = from[1].x; + packet[3] = from[1].x; + packet[4] = from[2].x; + packet[5] = from[2].x; + packet[6] = from[3].x; + packet[7] = from[3].x; + return packet; } -template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, min; +template <> +EIGEN_STRONG_INLINE Packet4hf ploaddup(const Eigen::half* from) { + float16x4_t packet; + float16_t* tmp; + tmp = (float16_t*)&packet; + tmp[0] = from[0].x; + tmp[1] = from[0].x; + tmp[2] = from[1].x; + tmp[3] = from[1].x; + return packet; +} - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - min = vpmin_s32(a_lo, a_hi); - min = vpmin_s32(min, min); - - return vget_lane_s32(min, 0); +template <> +EIGEN_STRONG_INLINE Packet8hf ploadquad(const Eigen::half* from) { + Packet4hf lo, hi; + lo = vld1_dup_f16(reinterpret_cast(from)); + hi = vld1_dup_f16(reinterpret_cast(from+1)); + return vcombine_f16(lo, hi); } -// max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, max; +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); } - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - max = vpmax_f32(a_lo, a_hi); - max = vpmax_f32(max, max); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); } - return vget_lane_f32(max, 0); +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) { + return vbslq_f16(vreinterpretq_u16_f16(mask), a, b); } -template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, max; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - max = vpmax_s32(a_lo, a_hi); - max = vpmax_s32(max, max); - - return vget_lane_s32(max, 0); -} - -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 
and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ - -PALIGN_NEON(0,Packet4f,vextq_f32) -PALIGN_NEON(1,Packet4f,vextq_f32) -PALIGN_NEON(2,Packet4f,vextq_f32) -PALIGN_NEON(3,Packet4f,vextq_f32) -PALIGN_NEON(0,Packet4i,vextq_s32) -PALIGN_NEON(1,Packet4i,vextq_s32) -PALIGN_NEON(2,Packet4i,vextq_s32) -PALIGN_NEON(3,Packet4i,vextq_s32) - -#undef PALIGN_NEON - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); - - kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); - kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); - kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); - kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); - int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); - kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); - kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); - kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) { + return vbsl_f16(vreinterpret_u16_f16(mask), a, b); } -//---------- double ---------- - -// Clang 3.5 in the iOS toolchain has an 
ICE triggered by NEON intrisics for double. -// Confirmed at least with __apple_build_version__ = 6000054. -#ifdef __apple_build_version__ -// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed. -// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with -// major toolchain updates. -#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000) -#else -#define EIGEN_APPLE_DOUBLE_NEON_BUG 0 -#endif +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); } -#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); } -// Bug 907: workaround missing declarations of the following two functions in the ADK -// Defining these functions as templates ensures that if these intrinsics are -// already defined in arm_neon.h, then our workaround doesn't cause a conflict -// and has lower priority in overload resolution. 
-template -uint64x2_t vreinterpretq_u64_f64(T a) -{ - return (uint64x2_t) a; +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8hf& from) { + EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); } -template -float64x2_t vreinterpretq_f64_u64(T a) -{ - return (float64x2_t) a; +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4hf& from) { + EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast(to), from); } -typedef float64x2_t Packet2d; -typedef float64x1_t Packet1d; - -template<> struct packet_traits : default_packet_traits -{ - typedef Packet2d type; - typedef Packet2d half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, - HasHalfPacket=0, - - HasDiv = 1, - // FIXME check the Has* - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 0, - HasSqrt = 0 - }; -}; +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8hf& from) { + EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); +} -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4hf& from) { + EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast(to), from); +} -template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather(const Eigen::half* from, Index stride) { + Packet8hf res = pset1(Eigen::half(0.f)); + res = vsetq_lane_f16(from[0 * stride].x, res, 0); + res = vsetq_lane_f16(from[1 * stride].x, res, 1); + res = vsetq_lane_f16(from[2 * stride].x, res, 2); + res = vsetq_lane_f16(from[3 * stride].x, res, 3); + res = vsetq_lane_f16(from[4 * stride].x, res, 4); + res = vsetq_lane_f16(from[5 * stride].x, res, 5); + res = vsetq_lane_f16(from[6 * stride].x, res, 6); + res = vsetq_lane_f16(from[7 * stride].x, res, 7); + return res; +} -template<> 
EIGEN_STRONG_INLINE Packet2d plset(const double& a) -{ - const double countdown_raw[] = {0.0,1.0}; - const Packet2d countdown = vld1q_f64(countdown_raw); - return vaddq_f64(pset1(a), countdown); +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather(const Eigen::half* from, Index stride) { + Packet4hf res = pset1(Eigen::half(0.f)); + res = vset_lane_f16(from[0 * stride].x, res, 0); + res = vset_lane_f16(from[1 * stride].x, res, 1); + res = vset_lane_f16(from[2 * stride].x, res, 2); + res = vset_lane_f16(from[3 * stride].x, res, 3); + return res; } -template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8hf& from, Index stride) { + to[stride * 0].x = vgetq_lane_f16(from, 0); + to[stride * 1].x = vgetq_lane_f16(from, 1); + to[stride * 2].x = vgetq_lane_f16(from, 2); + to[stride * 3].x = vgetq_lane_f16(from, 3); + to[stride * 4].x = vgetq_lane_f16(from, 4); + to[stride * 5].x = vgetq_lane_f16(from, 5); + to[stride * 6].x = vgetq_lane_f16(from, 6); + to[stride * 7].x = vgetq_lane_f16(from, 7); +} -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4hf& from, Index stride) { + to[stride * 0].x = vget_lane_f16(from, 0); + to[stride * 1].x = vget_lane_f16(from, 1); + to[stride * 2].x = vget_lane_f16(from, 2); + to[stride * 3].x = vget_lane_f16(from, 3); +} -template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } +template <> +EIGEN_STRONG_INLINE void prefetch(const Eigen::half* addr) { + EIGEN_ARM_PREFETCH(addr); +} -template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); } +template <> 
+EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8hf& a) { + float16_t x[8]; + vst1q_f16(x, a); + Eigen::half h; + h.x = x[0]; + return h; +} -template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4hf& a) { + float16_t x[4]; + vst1_f16(x, a); + Eigen::half h; + h.x = x[0]; + return h; +} -#ifdef __ARM_FEATURE_FMA -// See bug 936. See above comment about FMA for float. -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } -#else -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } -#endif +template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) { + float16x4_t a_lo, a_hi; + Packet8hf a_r64; -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } + a_r64 = vrev64q_f16(a); + a_lo = vget_low_f16(a_r64); + a_hi = vget_high_f16(a_r64); + return vcombine_f16(a_hi, a_lo); +} -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf preverse(const Packet4hf& a) { + return vrev64_f16(a); +} -// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +template <> +EIGEN_STRONG_INLINE Packet8hf pabs(const Packet8hf& a) { + return vabsq_f16(a); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +template <> +EIGEN_STRONG_INLINE Packet4hf pabs(const Packet4hf& a) { + return vabs_f16(a); 
} -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const Packet8hf& a) { + float16x4_t a_lo, a_hi, sum; + + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + sum = vpadd_f16(a_lo, a_hi); + sum = vpadd_f16(sum, sum); + sum = vpadd_f16(sum, sum); + + Eigen::half h; + h.x = vget_lane_f16(sum, 0); + return h; } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const Packet4hf& a) { + float16x4_t sum; + + sum = vpadd_f16(a, a); + sum = vpadd_f16(sum, sum); + Eigen::half h; + h.x = vget_lane_f16(sum, 0); + return h; } -template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8hf& a) { + float16x4_t a_lo, a_hi, prod; -template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + prod = vmul_f16(a_lo, a_hi); + prod = vmul_f16(prod, vrev64_f16(prod)); -template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) -{ - return vld1q_dup_f64(from); + Eigen::half h; + h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1)); + return h; } -template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); } +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet4hf& a) { + float16x4_t prod; + prod = vmul_f16(a, vrev64_f16(a)); + Eigen::half h; + h.x = 
vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1)); + return h; +} -template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) -{ - Packet2d res = pset1(0.0); - res = vsetq_lane_f64(from[0*stride], res, 0); - res = vsetq_lane_f64(from[1*stride], res, 1); - return res; +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8hf& a) { + float16x4_t a_lo, a_hi, min; + + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + min = vpmin_f16(a_lo, a_hi); + min = vpmin_f16(min, min); + min = vpmin_f16(min, min); + + Eigen::half h; + h.x = vget_lane_f16(min, 0); + return h; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) -{ - to[stride*0] = vgetq_lane_f64(from, 0); - to[stride*1] = vgetq_lane_f64(from, 1); + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet4hf& a) { + Packet4hf tmp; + tmp = vpmin_f16(a, a); + tmp = vpmin_f16(tmp, tmp); + Eigen::half h; + h.x = vget_lane_f16(tmp, 0); + return h; } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ARM_PREFETCH(addr); } -// FIXME only store the 2 first elements ? 
-template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a, 0); } +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8hf& a) { + float16x4_t a_lo, a_hi, max; -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + max = vpmax_f16(a_lo, a_hi); + max = vpmax_f16(max, max); + max = vpmax_f16(max, max); -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } + Eigen::half h; + h.x = vget_lane_f16(max, 0); + return h; +} -#if EIGEN_COMP_CLANG && defined(__apple_build_version__) -// workaround ICE, see bug 907 -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } -#else -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } -#endif +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet4hf& a) { + Packet4hf tmp; + tmp = vpmax_f16(a, a); + tmp = vpmax_f16(tmp, tmp); + Eigen::half h; + h.x = vget_lane_f16(tmp, 0); + return h; +} -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - float64x2_t trn1, trn2; + const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]); + const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]); - // NEON zip performs interleaving of the supplied vectors. 
- // We perform two interleaves in a row to acquire the transposed vector - trn1 = vzip1q_f64(vecs[0], vecs[1]); - trn2 = vzip2q_f64(vecs[0], vecs[1]); + const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0])); + const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1])); - // Do the addition of the resulting vectors - return vaddq_f64(trn1, trn2); + kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]); + kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]); + kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]); + kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]); } -// Other reduction functions: -// mul -#if EIGEN_COMP_CLANG && defined(__apple_build_version__) -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } -#else -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } -#endif -// min -template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN16 float16x4x4_t tmp_x4; + float16_t* tmp = (float16_t*)&kernel; + tmp_x4 = vld4_f16(tmp); -// max -template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); } - -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ - -PALIGN_NEON(0,Packet2d,vextq_f64) 
-PALIGN_NEON(1,Packet2d,vextq_f64) -#undef PALIGN_NEON - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); - float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); - - kernel.packet[0] = trn1; - kernel.packet[1] = trn2; -} -#endif // EIGEN_ARCH_ARM64 + kernel.packet[0] = tmp_x4.val[0]; + kernel.packet[1] = tmp_x4.val[1]; + kernel.packet[2] = tmp_x4.val[2]; + kernel.packet[3] = tmp_x4.val[3]; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + float16x8x2_t T_1[4]; + + T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]); + T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]); + T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]); + T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]); + + float16x8x2_t T_2[4]; + T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]); + T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]); + T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]); + T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]); + + float16x8x2_t T_3[4]; + T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]); + T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]); + T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]); + T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]); + + kernel.packet[0] = T_3[0].val[0]; + kernel.packet[1] = T_3[2].val[0]; + kernel.packet[2] = T_3[1].val[0]; + kernel.packet[3] = T_3[3].val[0]; + kernel.packet[4] = T_3[0].val[1]; + kernel.packet[5] = T_3[2].val[1]; + kernel.packet[6] = T_3[1].val[1]; + kernel.packet[7] = T_3[3].val[1]; +} +#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/NEON/TypeCasting.h b/inst/include/Eigen/src/Core/arch/NEON/TypeCasting.h new file mode 100644 index 00000000..54f97336 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -0,0 +1,1419 @@ +// This file is part of Eigen, a lightweight C++ template library 
+// for linear algebra. +// +// Copyright (C) 2018 Rasmus Munk Larsen +// Copyright (C) 2020 Antonio Sanchez +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_NEON_H +#define EIGEN_TYPE_CASTING_NEON_H + +namespace Eigen { + +namespace internal { + +//============================================================================== +// pcast, SrcType = float +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4f& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2f& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +// If float64 exists, first convert to that to keep as much precision as possible. +#if EIGEN_ARCH_ARM64 +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4f& a) { + // Discard second half of input. + return vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4f& a) { + // Discard second half of input. + return vcvtq_u64_f64(vcvt_f64_f32(vget_low_f32(a))); +} +#else +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4f& a) { + // Discard second half of input. + return vmovl_s32(vget_low_s32(vcvtq_s32_f32(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4f& a) { + // Discard second half of input. 
+ return vmovl_u32(vget_low_u32(vcvtq_u32_f32(a))); +} +#endif // EIGEN_ARCH_ARM64 + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vcvtq_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2f& a) { + return vcvt_s32_f32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { + return vcvtq_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2f& a) { + return vcvt_u32_f32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4f& a, const Packet4f& b) { + return vcombine_s16(vmovn_s32(vcvtq_s32_f32(a)), vmovn_s32(vcvtq_s32_f32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2f& a, const Packet2f& b) { + return vmovn_s32(vcombine_s32(vcvt_s32_f32(a), vcvt_s32_f32(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4f& a, const Packet4f& b) { + return vcombine_u16(vmovn_u32(vcvtq_u32_f32(a)), vmovn_u32(vcvtq_u32_f32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2f& a, const Packet2f& b) { + return vmovn_u32(vcombine_u32(vcvt_u32_f32(a), vcvt_u32_f32(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4f& a, const Packet4f& b, const Packet4f& c, + const Packet4f& d) { + const int16x8_t ab_s16 = pcast(a, b); + const int16x8_t cd_s16 = pcast(c, d); + return vcombine_s8(vmovn_s16(ab_s16), 
vmovn_s16(cd_s16)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2f& a, const Packet2f& b, const Packet2f& c, + const Packet2f& d) { + const int16x4_t ab_s16 = pcast(a, b); + const int16x4_t cd_s16 = pcast(c, d); + return vmovn_s16(vcombine_s16(ab_s16, cd_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4f& a, const Packet4f& b, const Packet4f& c, + const Packet4f& d) { + const uint16x8_t ab_u16 = pcast(a, b); + const uint16x8_t cd_u16 = pcast(c, d); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2f& a, const Packet2f& b, const Packet2f& c, + const Packet2f& d) { + const uint16x4_t ab_u16 = pcast(a, b); + const uint16x4_t cd_u16 = pcast(c, d); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +//============================================================================== +// pcast, SrcType = int8_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet16c& a) { + // Discard all but first 4 bytes. + return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet8c& a) { + // Discard all but first 2 bytes. + return vcvt_f32_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet16c& a) { + // Discard all but first two bytes. 
+ return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet16c& a) { + // Discard all but first 4 bytes. + return vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet8c& a) { + // Discard all but first 2 bytes. + return vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16c& a) { + return vreinterpretq_u32_s32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8c& a) { + return vreinterpret_u32_s32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet16c& a) { + // Discard second half of input. + return vmovl_s8(vget_low_s8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet8c& a) { + // Discard second half of input. 
+ return vget_low_s16(vmovl_s8(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { + return vreinterpretq_u16_s16(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet8c& a) { + return vreinterpret_u16_s16(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet16c& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet8c& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4c pcast(const Packet4c& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) { + return vreinterpretq_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { + return vreinterpret_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) { + return static_cast(a); +} + +//============================================================================== +// pcast, SrcType = uint8_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet16uc& a) { + // Discard all but first 4 bytes. + return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet8uc& a) { + // Discard all but first 2 bytes. 
+ return vcvt_f32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16uc& a) { + // Discard all but first two bytes. + return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16uc& a) { + // Discard all but first 4 bytes. + return vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8uc& a) { + // Discard all but first 2 bytes. + return vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { + return vreinterpretq_s32_u32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet8uc& a) { + return vreinterpret_s32_u32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet16uc& a) { + // Discard second half of input. + return vmovl_u8(vget_low_u8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet8uc& a) { + // Discard second half of input. 
+ return vget_low_u16(vmovl_u8(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { + return vreinterpretq_s16_u16(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet8uc& a) { + return vreinterpret_s16_u16(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16uc& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8uc& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4uc& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) { + return vreinterpretq_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { + return vreinterpret_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) { + return static_cast(a); +} + +//============================================================================== +// pcast, SrcType = int16_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8s& a) { + // Discard second half of input. + return vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet4s& a) { + // Discard second half of input. 
+ return vcvt_f32_s32(vget_low_s32(vmovl_s16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet8s& a) { + // Discard all but first two values. + return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet8s& a) { + // Discard second half of input. + return vmovl_s16(vget_low_s16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet4s& a) { + // Discard second half of input. + return vget_low_s32(vmovl_s16(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { + return vreinterpretq_u32_s32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4s& a) { + return vreinterpret_u32_s32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet8s& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet4s& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) { + return vreinterpretq_u16_s16(a); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) { + return vreinterpret_u16_s16(a); +} + +template <> +struct 
type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet8s& a, const Packet8s& b) { + return vcombine_s8(vmovn_s16(a), vmovn_s16(b)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet4s& a, const Packet4s& b) { + return vmovn_s16(vcombine_s16(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet8s& a, const Packet8s& b) { + return vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(a)), vmovn_u16(vreinterpretq_u16_s16(b))); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet4s& a, const Packet4s& b) { + return vmovn_u16(vcombine_u16(vreinterpret_u16_s16(a), vreinterpret_u16_s16(b))); +} + +//============================================================================== +// pcast, SrcType = uint16_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8us& a) { + // Discard second half of input. + return vcvtq_f32_u32(vmovl_u16(vget_low_u16(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet4us& a) { + // Discard second half of input. + return vcvt_f32_u32(vget_low_u32(vmovl_u16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8us& a) { + // Discard all but first two values. 
+ return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8us& a) { + // Discard second half of input. + return vmovl_u16(vget_low_u16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4us& a) { + // Discard second half of input. + return vget_low_u32(vmovl_u16(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { + return vreinterpretq_s32_u32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet4us& a) { + return vreinterpret_s32_u32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet8us& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet4us& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) { + return vreinterpretq_s16_u16(a); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) { + return vreinterpret_s16_u16(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet8us& a, const Packet8us& b) { + return vcombine_u8(vmovn_u16(a), vmovn_u16(b)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc 
pcast(const Packet4us& a, const Packet4us& b) { + return vmovn_u16(vcombine_u16(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { + return vreinterpretq_s8_u8(pcast(a, b)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet4us& a, const Packet4us& b) { + return vreinterpret_s8_u8(pcast(a, b)); +} + +//============================================================================== +// pcast, SrcType = int32_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vcvtq_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2i& a) { + return vcvt_f32_s32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4i& a) { + // Discard second half of input. 
+ return vmovl_s32(vget_low_s32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4i& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2i& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) { + return vreinterpretq_u32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) { + return vreinterpret_u32_s32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4i& a, const Packet4i& b) { + return vcombine_s16(vmovn_s32(a), vmovn_s32(b)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2i& a, const Packet2i& b) { + return vmovn_s32(vcombine_s32(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4i& a, const Packet4i& b) { + return vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(a)), vmovn_u32(vreinterpretq_u32_s32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2i& a, const Packet2i& b) { + return vmovn_u32(vreinterpretq_u32_s32(vcombine_s32(a, b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4i& a, const Packet4i& b, const Packet4i& c, + const Packet4i& d) 
{ + const int16x8_t ab_s16 = pcast(a, b); + const int16x8_t cd_s16 = pcast(c, d); + return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2i& a, const Packet2i& b, const Packet2i& c, + const Packet2i& d) { + const int16x4_t ab_s16 = vmovn_s32(vcombine_s32(a, b)); + const int16x4_t cd_s16 = vmovn_s32(vcombine_s32(c, d)); + return vmovn_s16(vcombine_s16(ab_s16, cd_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4i& a, const Packet4i& b, const Packet4i& c, + const Packet4i& d) { + const uint16x8_t ab_u16 = pcast(a, b); + const uint16x8_t cd_u16 = pcast(c, d); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2i& a, const Packet2i& b, const Packet2i& c, + const Packet2i& d) { + const uint16x4_t ab_u16 = pcast(a, b); + const uint16x4_t cd_u16 = pcast(c, d); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +//============================================================================== +// pcast, SrcType = uint32_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { + return vcvtq_f32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2ui& a) { + return vcvt_f32_u32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4ui& a) { + // Discard second half of input. 
+ return vmovl_u32(vget_low_u32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4ui& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2ui& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) { + return vreinterpretq_s32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) { + return vreinterpret_s32_u32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4ui& a, const Packet4ui& b) { + return vcombine_u16(vmovn_u32(a), vmovn_u32(b)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2ui& a, const Packet2ui& b) { + return vmovn_u32(vcombine_u32(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { + return vreinterpretq_s16_u16(pcast(a, b)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2ui& a, const Packet2ui& b) { + return vreinterpret_s16_u16(pcast(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, + const Packet4ui& d) { + const uint16x8_t ab_u16 = 
vcombine_u16(vmovn_u32(a), vmovn_u32(b)); + const uint16x8_t cd_u16 = vcombine_u16(vmovn_u32(c), vmovn_u32(d)); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, + const Packet2ui& d) { + const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(a, b)); + const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(c, d)); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, + const Packet4ui& d) { + return vreinterpretq_s8_u8(pcast(a, b, c, d)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, + const Packet2ui& d) { + return vreinterpret_s8_u8(pcast(a, b, c, d)); +} + +//============================================================================== +// pcast, SrcType = int64_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2l& a, const Packet2l& b) { + return vcvtq_f32_s32(vcombine_s32(vmovn_s64(a), vmovn_s64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2l& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) { + return vreinterpretq_u64_s64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> 
+EIGEN_STRONG_INLINE Packet4i pcast(const Packet2l& a, const Packet2l& b) { + return vcombine_s32(vmovn_s64(a), vmovn_s64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2l& a, const Packet2l& b) { + return vcombine_u32(vmovn_u64(vreinterpretq_u64_s64(a)), vmovn_u64(vreinterpretq_u64_s64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d) { + const int32x4_t ab_s32 = pcast(a, b); + const int32x4_t cd_s32 = pcast(c, d); + return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d) { + const uint32x4_t ab_u32 = pcast(a, b); + const uint32x4_t cd_u32 = pcast(c, d); + return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d, const Packet2l& e, const Packet2l& f, + const Packet2l& g, const Packet2l& h) { + const int16x8_t abcd_s16 = pcast(a, b, c, d); + const int16x8_t efgh_s16 = pcast(e, f, g, h); + return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d, const Packet2l& e, const Packet2l& f, + 
const Packet2l& g, const Packet2l& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +//============================================================================== +// pcast, SrcType = uint64_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2ul& a, const Packet2ul& b) { + return vcvtq_f32_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2ul& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) { + return vreinterpretq_s64_u64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u32(vmovn_u64(a), vmovn_u64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { + return vreinterpretq_s32_u32(pcast(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d) { + const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b))); + const uint16x4_t cd_u16 = 
vmovn_u32(vcombine_u32(vmovn_u64(c), vmovn_u64(d))); + return vcombine_u16(ab_u16, cd_u16); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d) { + return vreinterpretq_s16_u16(pcast(a, b, c, d)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, + const Packet2ul& g, const Packet2ul& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, + const Packet2ul& g, const Packet2ul& h) { + return vreinterpretq_s8_u8(pcast(a, b, c, d, e, f, g, h)); +} + +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { + return vreinterpret_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { + return vreinterpret_f32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return vreinterpretq_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return vreinterpretq_f32_u32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4c 
preinterpret(const Packet4uc& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { + return vreinterpret_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return vreinterpretq_s8_u8(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { + return vreinterpret_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return vreinterpretq_u8_s8(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { + return vreinterpret_s16_u16(a); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return vreinterpretq_s16_u16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { + return vreinterpret_u16_s16(a); +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return vreinterpretq_u16_s16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { + return vreinterpret_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { + return vreinterpret_s32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return vreinterpretq_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return vreinterpretq_s32_u32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { + return vreinterpret_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { + return vreinterpret_u32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return vreinterpretq_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return 
vreinterpretq_u32_s32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { + return vreinterpretq_s64_u64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return vreinterpretq_u64_s64(a); +} + +#if EIGEN_ARCH_ARM64 + +//============================================================================== +// pcast/preinterpret, Double +//============================================================================== + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2d& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_f32(vcvt_f32_f64(a), vcvt_f32_f64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2d& a) { + return vcvtq_s64_f64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2d& a) { + return vcvtq_u64_f64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_s32(vmovn_s64(vcvtq_s64_f64(a)), vmovn_s64(vcvtq_s64_f64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_u32(vmovn_u64(vcvtq_u64_f64(a)), vmovn_u64(vcvtq_u64_f64(b))); +} + +template <> +struct 
type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d) { + const int32x4_t ab_s32 = pcast(a, b); + const int32x4_t cd_s32 = pcast(c, d); + return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d) { + const uint32x4_t ab_u32 = pcast(a, b); + const uint32x4_t cd_u32 = pcast(c, d); + return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d, const Packet2d& e, const Packet2d& f, + const Packet2d& g, const Packet2d& h) { + const int16x8_t abcd_s16 = pcast(a, b, c, d); + const int16x8_t efgh_s16 = pcast(e, f, g, h); + return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d, const Packet2d& e, const Packet2d& f, + const Packet2d& g, const Packet2d& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { + // Discard second-half of input. 
+ return vcvt_f64_f32(vget_low_f32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_s8(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_u8(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_s16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_u16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4i& a) { + // Discard second half of input. + return vcvtq_f64_s64(vmovl_s32(vget_low_s32(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4ui& a) { + // Discard second half of input. 
+ return vcvtq_f64_u64(vmovl_u32(vget_low_u32(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2l& a) { + return vcvtq_f64_s64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { + return vcvtq_f64_u64(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return vreinterpretq_f64_s64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return vreinterpretq_f64_u64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return vreinterpretq_s64_f64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { + return vreinterpretq_u64_f64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return vreinterpretq_f64_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return vreinterpretq_s32_f64(a); +} + +#endif // EIGEN_ARCH_ARM64 + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_NEON_H diff --git a/inst/include/Eigen/src/Core/arch/SSE/Complex.h b/inst/include/Eigen/src/Core/arch/SSE/Complex.h index 4cb1b667..8fe22da4 100644 --- a/inst/include/Eigen/src/Core/arch/SSE/Complex.h +++ b/inst/include/Eigen/src/Core/arch/SSE/Complex.h @@ -19,7 +19,7 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} - __m128 v; + Packet4f v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -40,20 +40,33 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, HasMax = 0, HasSetLinear = 
0, - HasBlend = 1 + HasBlend = 1 }; }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet4f as_real; + enum { + size=2, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); @@ -82,10 +95,11 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return 
Packet2cf(ploadu(&numext::real_ref(*from))); } @@ -93,19 +107,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { Packet2cf res; -#if EIGEN_GNUC_AT_MOST(4,2) - // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2 - res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast(&from)); -#elif EIGEN_GNUC_AT_LEAST(4,6) - // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6 - //#pragma GCC diagnostic push - //#pragma GCC diagnostic ignored "-Wuninitialized" - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); - //#pragma GCC diagnostic pop +#ifdef EIGEN_VECTORIZE_SSE3 + res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast(&from))); #else - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); + res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&from))); + res.v = _mm_movelh_ps(res.v, res.v); #endif - return Packet2cf(_mm_movelh_ps(res.v,res.v)); + return res; } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } @@ -152,97 +160,26 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = _mm_movehl_ps(first.v, first.v); - first.v = _mm_movelh_ps(first.v, second.v); - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return 
padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); - #endif - } -}; - -template<> struct conj_helper +EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; + return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, 
const Packet2cf& b) { // TODO optimize it for SSE3 and 4 - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a, pconj(b)); __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2)))); } -EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) -{ - return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); -} //---------- double ---------- @@ -250,7 +187,7 @@ struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} - __m128d v; + Packet2d v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -271,6 +208,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -280,7 +218,18 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet2d as_real; + enum { + size=1, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } @@ -305,10 +254,11 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return 
Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); } // FIXME force unaligned load, this is a temporary fix template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) @@ -340,86 +290,17 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack return pfirst(a); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) -{ - return vecs[0]; -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
- } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)), mask))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 - Packet1cd res = 
conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); __m128d s = _mm_mul_pd(b.v,b.v); return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } @@ -439,33 +320,32 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } -template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { - __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); - return Packet2cf(_mm_castpd_ps(result)); +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + __m128 eq = _mm_cmpeq_ps(a.v, b.v); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } -template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex b) +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { - return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast(&b))); + __m128d eq = _mm_cmpeq_pd(a.v, b.v); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } -template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex b) -{ - return pset1(b); +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); } -template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex b) -{ - return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast(&b))); +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) { + return psqrt_complex(a); } -template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex b) -{ - return pset1(b); +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { + return psqrt_complex(a); } } // end namespace internal - } // end namespace Eigen #endif // 
EIGEN_COMPLEX_SSE_H diff --git a/inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h b/inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h index 7b5f948e..8736d0d6 100644 --- a/inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -8,7 +8,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from +/* The sin and cos and functions of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -20,426 +20,57 @@ namespace Eigen { namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - - /* the smallest non denormalized float number */ - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); - - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 
3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - - - Packet4i emm0; - - Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN - Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); - - /* keep only the fractional part */ - x = _mm_and_ps(x, p4f_inv_mant_mask); - x = _mm_or_ps(x, p4f_half); - - emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), - _mm_and_ps(iszero_mask, p4f_minus_inf)); +Packet4f plog(const Packet4f& _x) { + return plog_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - 
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - Packet4f tmp, fx; - Packet4i emm0; +Packet2d plog(const Packet2d& _x) { + return plog_double(_x); +} - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog2(const Packet4f& _x) { + return plog2_float(_x); +} - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d plog2(const Packet2d& _x) { + return plog2_double(_x); +} -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_ps(fx); -#else - emm0 = _mm_cvttps_epi32(fx); - tmp = _mm_cvtepi32_ps(emm0); - /* if greater, substract 1 */ - Packet4f mask = _mm_cmpgt_ps(tmp, fx); - mask = _mm_and_ps(mask, p4f_1); - fx = psub(tmp, mask); -#endif +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog1p(const Packet4f& _x) { + return generic_plog1p(_x); +} - tmp = pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, 
p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); - - // build 2^n - emm0 = _mm_cvttps_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_0x7f); - emm0 = _mm_slli_epi32(emm0, 23); - return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexpm1(const Packet4f& _x) { + return generic_expm1(_x); } + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d pexp(const Packet2d& _x) +Packet4f pexp(const Packet4f& _x) { - Packet2d x = _x; - - _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); - _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); - _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); - - Packet2d tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); - -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_pd(fx); -#else - emm0 = _mm_cvttpd_epi32(fx); - tmp = 
_mm_cvtepi32_pd(emm0); - /* if greater, substract 1 */ - Packet2d mask = _mm_cmpgt_pd(tmp, fx); - mask = _mm_and_pd(mask, p2d_1); - fx = psub(tmp, mask); -#endif - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // build 2^n - emm0 = _mm_cvttpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023_0); - emm0 = _mm_slli_epi32(emm0, 20); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); + return pexp_float(_x); } -/* evaluation of 4 sines at onces, using SSE2 intrinsics. - - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. 
-*/ +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp(const Packet2d& x) +{ + return pexp_double(x); +} template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, sign_bit, y; - - Packet4i emm0, emm2; - sign_bit = x; - /* take the absolute value */ - x = pabs(x); - - /* take the modulo */ - - /* extract the sign bit (upper one) */ - sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* store the integer part of y in mm0 */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - /* get the swap sign flag */ - emm0 = _mm_and_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x 
<= Pi/4 - and another one for Pi/4 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, y; - Packet4i emm0, emm2; - - x = pabs(x); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* get the integer part of y */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - - emm2 = _mm_sub_epi32(emm2, p4i_2); - - /* get the swap sign flag */ - emm0 = _mm_andnot_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask */ - emm2 = _mm_and_si128(emm2, p4i_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - - Packet4f sign_bit = _mm_castsi128_ps(emm0); - Packet4f poly_mask = _mm_castsi128_ps(emm2); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = pmul(y, p4f_minus_cephes_DP1); - xmm2 = pmul(y, 
p4f_minus_cephes_DP2); - xmm3 = pmul(y, p4f_minus_cephes_DP3); - x = padd(x, xmm1); - x = padd(x, xmm2); - x = padd(x, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = p4f_coscof_p0; - Packet4f z = pmul(x,x); - - y = pmadd(y,z,p4f_coscof_p1); - y = pmadd(y,z,p4f_coscof_p2); - y = pmul(y, z); - y = pmul(y, z); - Packet4f tmp = _mm_mul_ps(z, p4f_half); - y = psub(y, tmp); - y = padd(y, p4f_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - Packet4f y2 = p4f_sincof_p0; - y2 = pmadd(y2, z, p4f_sincof_p1); - y2 = pmadd(y2, z, p4f_sincof_p2); - y2 = pmul(y2, z); - y2 = pmadd(y2, x, x); - - /* select the correct result from the two polynoms */ - y2 = _mm_and_ps(poly_mask, y2); - y = _mm_andnot_ps(poly_mask, y); - y = _mm_or_ps(y,y2); - - /* update the sign */ - return _mm_xor_ps(y, sign_bit); + return pcos_float(_x); } #if EIGEN_FAST_MATH @@ -455,17 +86,17 @@ Packet4f pcos(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& _x) { - Packet4f half = pmul(_x, pset1(.5f)); - Packet4f denormal_mask = _mm_and_ps( - _mm_cmpge_ps(_x, _mm_setzero_ps()), - _mm_cmplt_ps(_x, pset1((std::numeric_limits::min)()))); + Packet4f minus_half_x = pmul(_x, pset1(-0.5f)); + Packet4f denormal_mask = pandnot( + pcmp_lt(_x, pset1((std::numeric_limits::min)())), + pcmp_lt(_x, pzero(_x))); // Compute approximate reciprocal sqrt. Packet4f x = _mm_rsqrt_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1(1.5f))); // Flush results for denormals to zero. 
- return _mm_andnot_ps(denormal_mask, pmul(_x,x)); + return pandnot(pmul(_x,x), denormal_mask); } #else @@ -478,41 +109,48 @@ Packet4f psqrt(const Packet4f& x) { return _mm_sqrt_ps(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16b psqrt(const Packet16b& x) { return x; } + #if EIGEN_FAST_MATH template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); Packet4f neg_half = pmul(_x, p4f_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); - Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); - - // Fill in NaNs and Infs for the negative/zero entries. - Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); - Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); - Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), - _mm_and_ps(zero_mask, p4f_inf)); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm_or_ps(x, infs_and_nans); + // Identity infinite, zero, negative and denormal arguments. 
+ Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min); + Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf); + Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask); + + // Compute an approximate result using the rsqrt intrinsic. + Packet4f y_approx = _mm_rsqrt_ps(_x); + + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet4f y_newton = pmul( + y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five)); + + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). + return pselect(not_normal_finite_mask, y_approx, y_newton); } #else template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { - // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation. + // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation. return _mm_div_ps(pset1(1.0f), _mm_sqrt_ps(x)); } @@ -520,7 +158,6 @@ Packet4f prsqrt(const Packet4f& x) { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. 
return _mm_div_pd(pset1(1.0), _mm_sqrt_pd(x)); } @@ -548,7 +185,7 @@ double sqrt(const double &x) { #if EIGEN_COMP_GNUC_STRICT // This works around a GCC bug generating poor code for _mm_sqrt_pd - // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b + // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970 return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); diff --git a/inst/include/Eigen/src/Core/arch/SSE/PacketMath.h b/inst/include/Eigen/src/Core/arch/SSE/PacketMath.h index 60e2517e..db102c73 100755 --- a/inst/include/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/inst/include/Eigen/src/Core/arch/SSE/PacketMath.h @@ -18,13 +18,15 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) +// 32 bits => 8 registers +// 64 bits => 16 registers #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif @@ -34,47 +36,75 @@ namespace internal { // One solution is to increase ABI version using -fabi-version=4 (or greater). 
// Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper // structure: -template -struct eigen_packet_wrapper -{ - EIGEN_ALWAYS_INLINE operator T&() { return m_val; } - EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } - EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { - m_val = v; - return *this; - } - - T m_val; -}; typedef eigen_packet_wrapper<__m128> Packet4f; -typedef eigen_packet_wrapper<__m128i> Packet4i; typedef eigen_packet_wrapper<__m128d> Packet2d; #else typedef __m128 Packet4f; -typedef __m128i Packet4i; typedef __m128d Packet2d; #endif +typedef eigen_packet_wrapper<__m128i, 0> Packet4i; +typedef eigen_packet_wrapper<__m128i, 1> Packet16b; + template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; template<> struct is_arithmetic<__m128d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template +struct shuffle_mask{ + enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) }; +}; +// TODO: change the implementation of all swizzle* ops from macro to template, #define vec4f_swizzle1(v,p,q,r,s) \ - (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) + Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask::mask)))) #define vec4i_swizzle1(v,p,q,r,s) \ - (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) + Packet4i(_mm_shuffle_epi32( v, (shuffle_mask::mask))) #define vec2d_swizzle1(v,p,q) \ - (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) - + Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask)))) + #define vec4f_swizzle2(a,b,p,q,r,s) \ - 
(_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) + Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask::mask))) #define vec4i_swizzle2(a,b,p,q,r,s) \ - (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) + Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask::mask))))) + +EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_movelh_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_movehl_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_unpacklo_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_unpackhi_ps(a,b)); +} +#define vec4f_duplane(a,p) \ + vec4f_swizzle2(a,a,p,p,p,p) + +#define vec2d_swizzle2(a,b,mask) \ + Packet2d(_mm_shuffle_pd(a,b,mask)) + +EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) +{ + return Packet2d(_mm_unpacklo_pd(a,b)); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) +{ + return Packet2d(_mm_unpackhi_pd(a,b)); +} +#define vec2d_duplane(a,p) \ + vec2d_swizzle2(a,a,(p<<1)|p) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -83,7 +113,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet2d p2d_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1(X)) + const Packet4f p4f_##NAME = pset1frombits(X) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -92,36 +122,41 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going // to leverage AVX 
instructions. #ifndef EIGEN_VECTORIZE_AVX -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4, + size = 4, HasHalfPacket = 0, - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, + HasCmp = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasNdtri = 1, + HasExp = 1, + HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasBlend = 1 - + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 1, + HasCeil = 1, + HasFloor = 1, #ifdef EIGEN_VECTORIZE_SSE4_1 - , HasRound = 1, - HasFloor = 1, - HasCeil = 1 #endif + HasRint = 1 }; }; -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet2d type; typedef Packet2d half; enum { @@ -130,18 +165,19 @@ template<> struct packet_traits : default_packet_traits size=2, HasHalfPacket = 0, + HasCmp = 1, HasDiv = 1, + HasLog = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1 - + HasBlend = 1, + HasFloor = 1, + HasCeil = 1, #ifdef EIGEN_VECTORIZE_SSE4_1 - , HasRound = 1, - HasFloor = 1, - HasCeil = 1 #endif + HasRint = 1 }; }; #endif @@ -154,13 +190,56 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size=4, + HasShift = 1, HasBlend = 1 }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet16b type; + typedef Packet16b half; + 
enum { + Vectorizable = 1, + AlignedOnScalar = 1, + HasHalfPacket = 0, + size=16, + + HasAdd = 1, + HasSub = 1, + HasShift = 0, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSqrt = 1 + }; +}; + +template<> struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef double type; + typedef Packet2d half; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef int type; + typedef Packet4i half; + enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef bool type; + typedef Packet16b half; + enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; #ifndef EIGEN_VECTORIZE_AVX template<> struct scalar_div_cost { enum { value = 7 }; }; @@ -179,6 +258,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +template<> EIGEN_STRONG_INLINE Packet16b pset1(const bool& from) { return _mm_set1_epi8(static_cast(from)); } + +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return _mm_castsi128_ps(pset1(from)); } +template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); } + +template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); } +template<> 
EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); } +template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); } + +template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); } // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) @@ -190,7 +281,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); } #endif - + template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } @@ -199,9 +290,34 @@ template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b padd(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { 
return _mm_sub_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b psub(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_ps(a,b); +#else + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0)); + return padd(a, pxor(mask, b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_pd(a,b); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); + return padd(a, pxor(mask, b)); +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { @@ -218,6 +334,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); } +template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a) +{ + return psub(pset1(false), a); +} + template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -240,18 +361,126 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet16b pmul(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE 
Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } +#ifdef EIGEN_VECTORIZE_SSE4_1 +template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { + return _mm_blendv_ps(b,a,mask); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) { + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask))); +} + +template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b,a,mask); } + +template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) { + return _mm_blendv_epi8(b,a,mask); +} +#else +template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) { + Packet16b a_part = _mm_and_si128(mask, a); + Packet16b b_part = _mm_andnot_si128(mask, b); + return _mm_or_si128(a_part, b_part); +} +#endif + +template<> EIGEN_STRONG_INLINE Packet4i ptrue(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet16b ptrue(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +ptrue(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); 
+} +template<> EIGEN_STRONG_INLINE Packet2d +ptrue(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} + + +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pand(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b por(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pxor(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } + +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return 
_mm_cmple_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_min_ps, so we have to + // resort to inline ASM here. 
This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet4f res; + asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet4f res = b; + asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm_min_ps(b, a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_min_pd, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet2d res; + asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet2d res = b; + asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm_min_pd(b, a); +#endif +} template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -263,8 +492,45 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const #endif } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_max_ps, so we have to + // resort to inline ASM here. 
This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet4f res; + asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet4f res = b; + asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm_max_ps(b, a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_max_pd, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet2d res; + asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet2d res = b; + asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm_max_pd(b, a); +#endif +} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -276,36 +542,180 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const #endif } +template +EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) { + // In this implementation, we take advantage of the fact that pmin/pmax for SSE + // always return a if either a or b is NaN. 
+ Packet not_nan_mask_a = pcmp_eq(a, a); + Packet m = op(a, b); + return pselect(not_nan_mask_a, m, b); +} + +template +EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) { + // In this implementation, we take advantage of the fact that pmin/pmax for SSE + // always return a if either a or b is NaN. + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet m = op(b, a); + return pselect(not_nan_mask_a, m, a); +} + +// Add specializations for min/max with prescribed NaN progation. +template<> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_nan(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_nan(a, b, pmax); +} + +template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_right (const Packet4i& a) { return _mm_srli_epi32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_left (const Packet4i& a) { return _mm_slli_epi32(a,N); } + +template<> EIGEN_STRONG_INLINE Packet4f pabs(const 
Packet4f& a) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm_and_pd(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) +{ + #ifdef EIGEN_VECTORIZE_SSSE3 + return _mm_abs_epi32(a); + #else + Packet4i aux = _mm_srai_epi32(a,31); + return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); + #endif +} + #ifdef EIGEN_VECTORIZE_SSE4_1 -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round. + const Packet4f mask = pset1frombits(0x80000000u); + const Packet4f prev0dot5 = pset1frombits(0x3EFFFFFFu); + return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull)); + const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull)); + return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } template<> EIGEN_STRONG_INLINE 
Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } -#endif +#else +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { + // Adds and subtracts signum(a) * 2^52 to force rounding. + const Packet2d limit = pset1(static_cast(1ull<<52)); + const Packet2d abs_a = pabs(a); + Packet2d r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. 
+ r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If greater, subtract one. + Packet4f mask = _mm_cmpgt_ps(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet2d tmp = print(a); + // If greater, subtract one. + Packet2d mask = _mm_cmpgt_pd(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If smaller, add one. 
+ Packet4f mask = _mm_cmplt_ps(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet2d tmp = print(a); + // If smaller, add one. + Packet2d mask = _mm_cmplt_pd(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} +#endif template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet16b pload(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } #if EIGEN_COMP_MSVC template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { @@ -340,6 +750,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet16b ploadu(const bool* from) { + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_si128(reinterpret_cast(from)); +} template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) @@ -355,13 +769,32 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec4i_swizzle1(tmp, 0, 0, 1, 1); } +// Loads 8 bools from memory and returns the packet +// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7} +template<> EIGEN_STRONG_INLINE Packet16b ploaddup(const bool* from) +{ + __m128i tmp = _mm_castpd_si128(pload1(reinterpret_cast(from))); + return _mm_unpacklo_epi8(tmp, tmp); +} + +// Loads 4 bools from memory and returns the packet +// {b0, b0 b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3} +template<> EIGEN_STRONG_INLINE Packet16b +ploadquad(const bool* from) { + __m128i tmp = 
_mm_castps_si128(pload1(reinterpret_cast(from))); + tmp = _mm_unpacklo_epi8(tmp, tmp); + return _mm_unpacklo_epi16(tmp, tmp); +} + template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstore(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { @@ -374,7 +807,15 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); - } +} + +template<> EIGEN_DEVICE_FUNC inline Packet16b pgather(const bool* from, Index stride) +{ + return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride], + from[11*stride], from[10*stride], from[9*stride], from[8*stride], + from[7*stride], from[6*stride], from[5*stride], from[4*stride], + from[3*stride], 
from[2*stride], from[1*stride], from[0*stride]); +} template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { @@ -395,6 +836,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); } +template<> EIGEN_DEVICE_FUNC inline void pscatter(bool* to, const Packet16b& from, Index stride) +{ + to[4*stride*0] = _mm_cvtsi128_si32(from); + to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); +} + // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) @@ -409,7 +858,7 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } -#if EIGEN_COMP_PGI +#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900 typedef const void * SsePrefetchPtrType; #else typedef const char * SsePrefetchPtrType; @@ -437,32 +886,62 @@ template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { retu template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return _mm_cvtsd_f64(a); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { return _mm_cvtsi128_si32(a); } #endif +template<> EIGEN_STRONG_INLINE bool pfirst(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast(x & 1); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ return _mm_shuffle_ps(a,a,0x1B); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) -{ return _mm_shuffle_pd(a,a,0x1); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) -{ return _mm_shuffle_epi32(a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return 
_mm_shuffle_ps(a,a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) { +#ifdef EIGEN_VECTORIZE_SSSE3 + __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm_shuffle_epi8(a, mask); +#else + Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); + tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8)); +#endif +} -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); - return _mm_and_ps(a,mask); +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_generic(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) -{ - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); - return _mm_and_pd(a,mask); + +// Extract exponent without existence of Packet2l. 
+template<> +EIGEN_STRONG_INLINE +Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) { + const Packet2d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); + __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52); + return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3)); } -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) -{ - #ifdef EIGEN_VECTORIZE_SSSE3 - return _mm_abs_epi32(a); - #else - Packet4i aux = _mm_srai_epi32(a,31); - return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); - #endif + +template<> EIGEN_STRONG_INLINE Packet2d pfrexp(const Packet2d& a, Packet2d& exponent) { + return pfrexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_generic(a,exponent); +} + +// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well +// supported by SSE, and has more range than is needed for exponents. +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet2d max_exponent = pset1(2099.0); + const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent); + + // Convert e to integer and swizzle to low-order bits. 
+ const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3); + + // Split 2^e into four factors and multiply: + const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023); + Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4) + Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b + Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(ei, b), b), b); // e - 3b + c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b) + out = pmul(out, c); // a * 2^e + return out; } // with AVX, the default implementations based on pload1 are faster @@ -505,38 +984,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); } -#ifdef EIGEN_VECTORIZE_SSE3 -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - return _mm_hadd_pd(vecs[0], vecs[1]); -} - -#else -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - Packet4f tmp0, tmp1, tmp2; - tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); - tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]); - tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]); - tmp0 = _mm_add_ps(tmp0, tmp1); - tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]); - tmp1 = _mm_add_ps(tmp1, tmp2); - tmp2 = _mm_movehl_ps(tmp1, tmp0); - tmp0 = _mm_movelh_ps(tmp0, tmp1); - return _mm_add_ps(tmp0, tmp2); -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); -} -#endif // SSE3 - template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures @@ -562,38 +1009,28 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) } #ifdef EIGEN_VECTORIZE_SSSE3 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); -} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp0 = _mm_hadd_epi32(a,a); return pfirst(_mm_hadd_epi32(tmp0,tmp0)); } + #else template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); } +#endif -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - Packet4i tmp0, tmp1, tmp2; - tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - tmp0 = _mm_add_epi32(tmp0, tmp1); - tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - tmp1 = _mm_add_epi32(tmp1, tmp2); - tmp2 = _mm_unpacklo_epi64(tmp0, tmp1); - tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); - return _mm_add_epi32(tmp0, tmp2); +template<> EIGEN_STRONG_INLINE bool predux(const Packet16b& a) { + Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a)); + return (pfirst(tmp) != 0) || (pfirst(_mm_shuffle_epi32(tmp, 1)) != 0); } -#endif + // Other reduction functions: + // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { @@ -611,7 +1048,13 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) // TODO try to call _mm_mul_epu32 directly EIGEN_ALIGN16 int aux[4]; pstore(aux, a); - return (aux[0] * aux[1]) * (aux[2] * aux[3]);; + return (aux[0] * aux[1]) * (aux[2] * aux[3]); +} + +template<> EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) { + Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a)); + return ((pfirst(tmp) == 0x01010101) && + (pfirst(_mm_shuffle_epi32(tmp, 1)) == 0x01010101)); } // min @@ -666,113 +1109,16 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) #endif // EIGEN_VECTORIZE_SSE4_1 } -#if EIGEN_COMP_GNUC -// template <> EIGEN_STRONG_INLINE 
Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -// { -// Packet4f res = b; -// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); -// return res; -// } -// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i) +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x) // { -// Packet4i res = a; -// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i)); -// return res; +// return _mm_movemask_ps(x) == 0xF; // } -#endif - -#ifdef EIGEN_VECTORIZE_SSSE3 -// SSSE3 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset!=0) - first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); - } -}; -template -struct palign_impl +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset!=0) - first = _mm_alignr_epi8(second,first, Offset*4); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); - } -}; -#else -// SSE2 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset==1) - { - first = _mm_move_ss(first,second); - first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); - } - else if (Offset==2) - { - first = _mm_movehl_ps(first,first); - first = _mm_movelh_ps(first,second); - } - else if (Offset==3) - { - first = _mm_move_ss(first,second); - first = _mm_shuffle_ps(first,second,0x93); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) 
- { - if (Offset==1) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_shuffle_epi32(first,0x39); - } - else if (Offset==2) - { - first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); - first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - } - else if (Offset==3) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - { - first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); - first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); - } - } -}; -#endif + return _mm_movemask_ps(x) != 0x0; +} EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -799,6 +1145,100 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); + __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); + __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = _mm_unpacklo_epi16(T0, T2); + kernel.packet[1] = _mm_unpackhi_epi16(T0, T2); + kernel.packet[2] = _mm_unpacklo_epi16(T1, T3); + kernel.packet[3] = _mm_unpackhi_epi16(T1, T3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + // If we number the elements in the input thus: + // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f} + // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f} + // ... 
+ // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff}, + // + // the desired output is: + // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0} + // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1} + // ... + // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff}, + __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f + __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37 + __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f + __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57 + __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a + __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]); + __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]); + __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]); + __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]); + __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]); + __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]); + __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]); + __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]); + __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]); + __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]); + + __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34 + __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ... 
+ __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ... + __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ... + __m128i s6 = _mm_unpacklo_epi16(t5, t7); + __m128i s7 = _mm_unpackhi_epi16(t5, t7); + __m128i s8 = _mm_unpacklo_epi16(t8, ta); + __m128i s9 = _mm_unpackhi_epi16(t8, ta); + __m128i sa = _mm_unpacklo_epi16(t9, tb); + __m128i sb = _mm_unpackhi_epi16(t9, tb); + __m128i sc = _mm_unpacklo_epi16(tc, te); + __m128i sd = _mm_unpackhi_epi16(tc, te); + __m128i se = _mm_unpacklo_epi16(td, tf); + __m128i sf = _mm_unpackhi_epi16(td, tf); + + __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + __m128i u2 = _mm_unpacklo_epi32(s1, s5); + __m128i u3 = _mm_unpackhi_epi32(s1, s5); + __m128i u4 = _mm_unpacklo_epi32(s2, s6); + __m128i u5 = _mm_unpackhi_epi32(s2, s6); + __m128i u6 = _mm_unpacklo_epi32(s3, s7); + __m128i u7 = _mm_unpackhi_epi32(s3, s7); + __m128i u8 = _mm_unpacklo_epi32(s8, sc); + __m128i u9 = _mm_unpackhi_epi32(s8, sc); + __m128i ua = _mm_unpacklo_epi32(s9, sd); + __m128i ub = _mm_unpackhi_epi32(s9, sd); + __m128i uc = _mm_unpacklo_epi32(sa, se); + __m128i ud = _mm_unpackhi_epi32(sa, se); + __m128i ue = _mm_unpacklo_epi32(sb, sf); + __m128i uf = _mm_unpackhi_epi32(sb, sf); + + kernel.packet[0] = _mm_unpacklo_epi64(u0, u8); + kernel.packet[1] = _mm_unpackhi_epi64(u0, u8); + kernel.packet[2] = _mm_unpacklo_epi64(u1, u9); + kernel.packet[3] = _mm_unpackhi_epi64(u1, u9); + kernel.packet[4] = _mm_unpacklo_epi64(u2, ua); + kernel.packet[5] = _mm_unpackhi_epi64(u2, ua); + kernel.packet[6] = _mm_unpacklo_epi64(u3, ub); + kernel.packet[7] = _mm_unpackhi_epi64(u3, ub); + kernel.packet[8] = _mm_unpacklo_epi64(u4, uc); + kernel.packet[9] = _mm_unpackhi_epi64(u4, uc); + kernel.packet[10] = _mm_unpacklo_epi64(u5, ud); + 
kernel.packet[11] = _mm_unpackhi_epi64(u5, ud); + kernel.packet[12] = _mm_unpacklo_epi64(u6, ue); + kernel.packet[13] = _mm_unpackhi_epi64(u6, ue); + kernel.packet[14] = _mm_unpacklo_epi64(u7, uf); + kernel.packet[15] = _mm_unpackhi_epi64(u7, uf); +} + template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { const __m128i zero = _mm_setzero_si128(); const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); @@ -830,59 +1270,229 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons #endif } -template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_ps(a,pset1(b),1); -#else - return _mm_move_ss(a, _mm_load_ss(&b)); +// Scalar path for pmadd with FMA to ensure consistency with vectorized path. +#ifdef EIGEN_VECTORIZE_FMA +template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { + return ::fmaf(a,b,c); +} +template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { + return ::fma(a,b,c); +} #endif + + +// Packet math for Eigen::half +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. 
+ typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; } -template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_pd(a,pset1(b),1); -#else - return _mm_move_sd(a, _mm_load_sd(&b)); -#endif +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); } -template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_ps(a,pset1(b),(1<<3)); -#else - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); - return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1(b))); -#endif +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha + hb; + ha = 
half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha - hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); 
+ h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha / hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; } -template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) { -#ifdef EIGEN_VECTORIZE_SSE4_1 - return 
_mm_blend_pd(a,pset1(b),(1<<1)); -#else - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); - return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1(b))); -#endif + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; } -// Scalar path for pmadd with FMA to ensure consistency with vectorized path. -#ifdef __FMA__ -template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { - return ::fmaf(a,b,c); +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast(a); + to[stride*1].x = static_cast(a >> 16); + to[stride*2].x = static_cast(a >> 32); + to[stride*3].x = static_cast(a >> 48); } -template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { - return ::fma(a,b,c); + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); } + #endif + } // end namespace internal } // end namespace Eigen -#if EIGEN_COMP_PGI +#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900 // PGI++ does not define the following intrinsics in C++ mode. 
static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } diff --git a/inst/include/Eigen/src/Core/arch/SSE/TypeCasting.h b/inst/include/Eigen/src/Core/arch/SSE/TypeCasting.h index c6ca8c71..d2a0037e 100644 --- a/inst/include/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/inst/include/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -69,6 +69,71 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f return _mm_cvtps_pd(a); } +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return _mm_castps_si128(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return _mm_castsi128_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return _mm_castsi128_pd(a); +} + +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return _mm_castpd_si128(a); +} + +// Disable the following code since it's broken on too many platforms / compilers. 
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); + float f1 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 16)); + float f2 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 32)); + float f3 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 48)); + float f4 = static_cast(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/SVE/MathFunctions.h b/inst/include/Eigen/src/Core/arch/SVE/MathFunctions.h new file mode 100644 index 00000000..b139ea2e --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/SVE/MathFunctions.h @@ -0,0 +1,44 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_MATH_FUNCTIONS_SVE_H +#define EIGEN_MATH_FUNCTIONS_SVE_H + +namespace Eigen { +namespace internal { + +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pexp(const PacketXf& x) { + return pexp_float(x); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf plog(const PacketXf& x) { + return plog_float(x); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf psin(const PacketXf& x) { + return psin_float(x); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pcos(const PacketXf& x) { + return pcos_float(x); +} + +// Hyperbolic Tangent function. +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf ptanh(const PacketXf& x) { + return internal::generic_fast_tanh_float(x); +} +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_SVE_H diff --git a/inst/include/Eigen/src/Core/arch/SVE/PacketMath.h b/inst/include/Eigen/src/Core/arch/SVE/PacketMath.h new file mode 100644 index 00000000..9060b372 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/SVE/PacketMath.h @@ -0,0 +1,752 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_PACKET_MATH_SVE_H +#define EIGEN_PACKET_MATH_SVE_H + +namespace Eigen +{ +namespace internal +{ +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 + +template +struct sve_packet_size_selector { + enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) }; +}; + +/********************************* int32 **************************************/ +typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXi type; + typedef PacketXi half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = sve_packet_size_selector::size, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 // Not implemented in SVE + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketXi half; // Half not yet implemented + enum { + size = sve_packet_size_selector::size, + alignment = Aligned64, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr) +{ + svprfw(svptrue_b32(), addr, SV_PLDL1KEEP); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pset1(const numext::int32_t& from) +{ + return svdup_n_s32(from); +} + +template <> +EIGEN_STRONG_INLINE PacketXi plset(const numext::int32_t& a) +{ + numext::int32_t c[packet_traits::size]; + for (int i = 0; i < packet_traits::size; i++) c[i] = i; + return svadd_s32_z(svptrue_b32(), pset1(a), svld1_s32(svptrue_b32(), c)); +} + +template <> +EIGEN_STRONG_INLINE 
PacketXi padd(const PacketXi& a, const PacketXi& b) +{ + return svadd_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi psub(const PacketXi& a, const PacketXi& b) +{ + return svsub_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) +{ + return svneg_s32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) +{ + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmul(const PacketXi& a, const PacketXi& b) +{ + return svmul_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pdiv(const PacketXi& a, const PacketXi& b) +{ + return svdiv_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) +{ + return svmla_s32_z(svptrue_b32(), c, a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmin(const PacketXi& a, const PacketXi& b) +{ + return svmin_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmax(const PacketXi& a, const PacketXi& b) +{ + return svmax_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_le(const PacketXi& a, const PacketXi& b) +{ + return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu); // a <= b lanes -> all ones (svcmple, not the strict svcmplt) +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_lt(const PacketXi& a, const PacketXi& b) +{ + return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_eq(const PacketXi& a, const PacketXi& b) +{ + return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ptrue(const PacketXi& /*a*/) +{ + return svdup_n_s32_z(svptrue_b32(), 0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pzero(const PacketXi& /*a*/) +{ + return svdup_n_s32_z(svptrue_b32(), 0); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pand(const PacketXi& a, const PacketXi& b) 
+{ + return svand_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi por(const PacketXi& a, const PacketXi& b) +{ + return svorr_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pxor(const PacketXi& a, const PacketXi& b) +{ + return sveor_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pandnot(const PacketXi& a, const PacketXi& b) +{ + return svbic_s32_z(svptrue_b32(), a, b); +} + +template +EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) +{ + return svasrd_n_s32_z(svptrue_b32(), a, N); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) +{ + return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N))); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) +{ + return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N)); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pload(const numext::int32_t* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadu(const numext::int32_t* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploaddup(const numext::int32_t* from) +{ + svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...} + return svld1_gather_u32index_s32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadquad(const numext::int32_t* from) +{ + svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...} + return 
svld1_gather_u32index_s32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketXi& from) +{ + EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketXi& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXi pgather(const numext::int32_t* from, Index stride) +{ + // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...} + svint32_t indices = svindex_s32(0, stride); + return svld1_gather_s32index_s32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketXi& from, Index stride) +{ + // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...} + svint32_t indices = svindex_s32(0, stride); + svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketXi& a) +{ + // svlasta returns the first element if all predicate bits are 0 + return svlasta_s32(svpfalse_b(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) +{ + return svrev_s32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) +{ + return svabs_s32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketXi& a) +{ + return static_cast(svaddv_s32(svptrue_b32(), a)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketXi& a) +{ + EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT); + + // Multiply the vector by its reverse + svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a)); + svint32_t half_prod; + + // Extract the high half of the vector. 
Depending on the VL more reductions need to be done + if (EIGEN_ARM64_SVE_VL >= 2048) { + half_prod = svtbl_s32(prod, svindex_u32(32, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 1024) { + half_prod = svtbl_s32(prod, svindex_u32(16, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 512) { + half_prod = svtbl_s32(prod, svindex_u32(8, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 256) { + half_prod = svtbl_s32(prod, svindex_u32(4, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + } + // Last reduction + half_prod = svtbl_s32(prod, svindex_u32(2, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketXi& a) +{ + return svminv_s32(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketXi& a) +{ + return svmaxv_s32(svptrue_b32(), a); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + int buffer[packet_traits::size * N] = {0}; + int i = 0; + + PacketXi stride_index = svindex_s32(0, N); + + for (i = 0; i < N; i++) { + svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits::size); + } +} + +/********************************* float32 ************************************/ + +typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = sve_packet_size_selector::size, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + 
HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, // Not implemented in SVE + + HasDiv = 1, + HasFloor = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; // Half not yet implemented + typedef PacketXi integer_packet; + + enum { + size = sve_packet_size_selector::size, + alignment = Aligned64, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXf pset1(const float& from) +{ + return svdup_n_f32(from); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) +{ + float c[packet_traits::size]; + for (int i = 0; i < packet_traits::size; i++) c[i] = i; + return svadd_f32_z(svptrue_b32(), pset1(a), svld1_f32(svptrue_b32(), c)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf padd(const PacketXf& a, const PacketXf& b) +{ + return svadd_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psub(const PacketXf& a, const PacketXf& b) +{ + return svsub_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) +{ + return svneg_f32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) +{ + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmul(const PacketXf& a, const PacketXf& b) +{ + return svmul_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pdiv(const PacketXf& a, const PacketXf& b) +{ + return svdiv_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, 
const PacketXf& b, const PacketXf& c) +{ + return svmla_f32_z(svptrue_b32(), c, a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) +{ + return svmin_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) +{ + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) +{ + return svminnm_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) +{ + return svmax_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) +{ + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) +{ + return svmaxnm_f32_z(svptrue_b32(), a, b); +} + +// Float comparisons in SVE return svbool (predicate). Use svdup to set active +// lanes to all ones (0xffffffffu) and inactive lanes to 0. +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_le(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_eq(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu)); +} + +// Do a predicate inverse (svnot_b_z) on the predicate resulted from the +// greater/equal comparison (svcmpge_f32). Then fill a float vector with the +// active elements. 
+template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) +{ + return svrintm_f32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ptrue(const PacketXf& /*a*/) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu)); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXf pand(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b))); +} + +template <> +EIGEN_STRONG_INLINE PacketXf por(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b))); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pxor(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b))); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pandnot(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b))); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pload(const float* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadu(const float* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) +{ + svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...} + return svld1_gather_u32index_f32(svptrue_b32(), from, 
indices); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) +{ + svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...} + return svld1_gather_u32index_f32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketXf& from) +{ + EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketXf& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXf pgather(const float* from, Index stride) +{ + // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...} + svint32_t indices = svindex_s32(0, stride); + return svld1_gather_s32index_f32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketXf& from, Index stride) +{ + // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...} + svint32_t indices = svindex_s32(0, stride); + svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketXf& a) +{ + // svlasta returns the first element if all predicate bits are 0 + return svlasta_f32(svpfalse_b(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) +{ + return svrev_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) +{ + return svabs_f32_z(svptrue_b32(), a); +} + +// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for +// all vector extensions and the generic version. 
+template <> +EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& exponent) +{ + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketXf& a) +{ + return svaddv_f32(svptrue_b32(), a); +} + +// Other reduction functions: +// mul +// Only works for SVE Vls multiple of 128 +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) +{ + EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT); + // Multiply the vector by its reverse + svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a)); + svfloat32_t half_prod; + + // Extract the high half of the vector. Depending on the VL more reductions need to be done + if (EIGEN_ARM64_SVE_VL >= 2048) { + half_prod = svtbl_f32(prod, svindex_u32(32, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 1024) { + half_prod = svtbl_f32(prod, svindex_u32(16, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 512) { + half_prod = svtbl_f32(prod, svindex_u32(8, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 256) { + half_prod = svtbl_f32(prod, svindex_u32(4, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + } + // Last reduction + half_prod = svtbl_f32(prod, svindex_u32(2, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + + // The reduction is done to the first element. 
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) +{ + return svminv_f32(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) +{ + return svmaxv_f32(svptrue_b32(), a); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) +{ + float buffer[packet_traits::size * N] = {0}; + int i = 0; + + PacketXi stride_index = svindex_s32(0, N); + + for (i = 0; i < N; i++) { + svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits::size); + } +} + +template<> +EIGEN_STRONG_INLINE PacketXf pldexp(const PacketXf& a, const PacketXf& exponent) +{ + return pldexp_generic(a, exponent); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET_MATH_SVE_H diff --git a/inst/include/Eigen/src/Core/arch/SVE/TypeCasting.h b/inst/include/Eigen/src/Core/arch/SVE/TypeCasting.h new file mode 100644 index 00000000..7ba5d9cd --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/SVE/TypeCasting.h @@ -0,0 +1,49 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_SVE_H +#define EIGEN_TYPE_CASTING_SVE_H + +namespace Eigen { +namespace internal { + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXf pcast(const PacketXi& a) { + return svcvt_f32_s32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcast(const PacketXf& a) { + return svcvt_s32_f32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preinterpret(const PacketXi& a) { + return svreinterpret_f32_s32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preinterpret(const PacketXf& a) { + return svreinterpret_s32_f32(a); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_TYPE_CASTING_SVE_H diff --git a/inst/include/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/inst/include/Eigen/src/Core/arch/SYCL/InteropHeaders.h new file mode 100644 index 00000000..10856ff5 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/***************************************************************** + * InteropHeaders.h + * + * \brief: + * InteropHeaders + * + *****************************************************************/ + +#ifndef EIGEN_INTEROP_HEADERS_SYCL_H +#define EIGEN_INTEROP_HEADERS_SYCL_H + +namespace Eigen { + +#if !defined(EIGEN_DONT_VECTORIZE_SYCL) + +namespace internal { + +template +struct sycl_packet_traits : default_packet_traits { + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = lengths, + HasHalfPacket = 0, + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasSin = 1, + HasCos = 1, + HasTan = 1, + HasASin = 1, + HasACos = 1, + HasATan = 1, + HasSinh = 1, + HasCosh = 1, + HasTanh = 1, + HasLGamma = 0, + HasDiGamma = 0, + HasZeta = 0, + HasPolygamma = 0, + HasErf = 0, + HasErfc = 0, + HasNdtri = 0, + HasIGamma = 0, + HasIGammac = 0, + HasBetaInc = 0, + HasBlend = has_blend, + // This flag is used to indicate whether packet comparison is supported. + // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true. + HasCmp = 1, + HasMax = 1, + HasMin = 1, + HasMul = 1, + HasAdd = 1, + HasFloor = 1, + HasRound = 1, + HasRint = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasCeil = 1, + }; +}; + +#ifdef SYCL_DEVICE_ONLY +#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \ + template <> \ + struct packet_traits \ + : sycl_packet_traits { \ + typedef packet_type type; \ + typedef packet_type half; \ + }; + +SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4) +SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4) +SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2) +SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2) +#undef SYCL_PACKET_TRAITS + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#define SYCL_ARITHMETIC(packet_type) \ + template <> \ + struct is_arithmetic { \ + enum { value = true }; \ + }; +SYCL_ARITHMETIC(cl::sycl::cl_float4) +SYCL_ARITHMETIC(cl::sycl::cl_double2) +#undef SYCL_ARITHMETIC + +#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths) \ + template <> \ + struct unpacket_traits { \ + typedef unpacket_type type; \ + enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \ + typedef packet_type half; \ + }; +SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) +SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2) + +#undef SYCL_UNPACKET_TRAITS +#endif + +} // end namespace internal + +#endif + +namespace TensorSycl { +namespace internal { + +template +struct PacketWrapper; +// This function should never get called on the device +#ifndef SYCL_DEVICE_ONLY +template +struct PacketWrapper { + typedef typename ::Eigen::internal::unpacket_traits::type + Scalar; + template + EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) { + eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE"); + abort(); + } + EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in, + Scalar) { + return ::Eigen::internal::template plset(in); + } + EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) { + eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE"); + abort(); + } +}; + +#elif defined(SYCL_DEVICE_ONLY) +template +struct PacketWrapper { + typedef typename ::Eigen::internal::unpacket_traits::type + Scalar; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) { + switch (index) { + case 0: + return in.x(); + case 1: + return in.y(); + case 2: + return in.z(); + case 3: + return in.w(); + default: + //INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here. 
+ // The code will never reach here + __builtin_unreachable(); + } + __builtin_unreachable(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type( + Scalar in, Scalar other) { + return PacketReturnType(in, other, other, other); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { + lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]); + } +}; + +template +struct PacketWrapper { + typedef typename ::Eigen::internal::unpacket_traits::type + Scalar; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) { + return in; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, + Scalar) { + return PacketReturnType(in); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { + lhs = rhs[0]; + } +}; + +template +struct PacketWrapper { + typedef typename ::Eigen::internal::unpacket_traits::type + Scalar; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) { + switch (index) { + case 0: + return in.x(); + case 1: + return in.y(); + default: + //INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here. 
+ // The code will never reach here + __builtin_unreachable(); + } + __builtin_unreachable(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type( + Scalar in, Scalar other) { + return PacketReturnType(in, other); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { + lhs = PacketReturnType(rhs[0], rhs[1]); + } +}; + +#endif + +} // end namespace internal +} // end namespace TensorSycl +} // end namespace Eigen + +#endif // EIGEN_INTEROP_HEADERS_SYCL_H diff --git a/inst/include/Eigen/src/Core/arch/SYCL/MathFunctions.h b/inst/include/Eigen/src/Core/arch/SYCL/MathFunctions.h new file mode 100644 index 00000000..2ab0f2a7 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/SYCL/MathFunctions.h @@ -0,0 +1,301 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * MathFunctions.h + * + * \brief: + * MathFunctions + * + *****************************************************************/ + +#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H +#define EIGEN_MATH_FUNCTIONS_SYCL_H +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#if defined(SYCL_DEVICE_ONLY) +#define SYCL_PLOG(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog( \ + const packet_type& a) { \ + return cl::sycl::log(a); \ + } + +SYCL_PLOG(cl::sycl::cl_float4) +SYCL_PLOG(cl::sycl::cl_double2) +#undef SYCL_PLOG + +#define SYCL_PLOG1P(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p( \ + const packet_type& a) { \ + return cl::sycl::log1p(a); \ + } + +SYCL_PLOG1P(cl::sycl::cl_float4) +SYCL_PLOG1P(cl::sycl::cl_double2) +#undef SYCL_PLOG1P + +#define SYCL_PLOG10(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10( \ + const packet_type& a) { \ + return cl::sycl::log10(a); \ + } + +SYCL_PLOG10(cl::sycl::cl_float4) +SYCL_PLOG10(cl::sycl::cl_double2) +#undef SYCL_PLOG10 + +#define SYCL_PEXP(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp( \ + const packet_type& a) { \ + return cl::sycl::exp(a); \ + } + +SYCL_PEXP(cl::sycl::cl_float4) +SYCL_PEXP(cl::sycl::cl_float) +SYCL_PEXP(cl::sycl::cl_double2) +#undef SYCL_PEXP + +#define SYCL_PEXPM1(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1( \ + const packet_type& a) { \ + return cl::sycl::expm1(a); \ + } + +SYCL_PEXPM1(cl::sycl::cl_float4) +SYCL_PEXPM1(cl::sycl::cl_double2) +#undef SYCL_PEXPM1 + +#define SYCL_PSQRT(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt( \ + const packet_type& a) { \ + return cl::sycl::sqrt(a); \ + } + +SYCL_PSQRT(cl::sycl::cl_float4) +SYCL_PSQRT(cl::sycl::cl_double2) +#undef SYCL_PSQRT + +#define SYCL_PRSQRT(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt( \ + const packet_type& a) { \ + return cl::sycl::rsqrt(a); \ + } + +SYCL_PRSQRT(cl::sycl::cl_float4) +SYCL_PRSQRT(cl::sycl::cl_double2) +#undef SYCL_PRSQRT + +/** \internal \returns the sine of \a a (coeff-wise) */ +#define 
SYCL_PSIN(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin( \ + const packet_type& a) { \ + return cl::sycl::sin(a); \ + } + +SYCL_PSIN(cl::sycl::cl_float4) +SYCL_PSIN(cl::sycl::cl_double2) +#undef SYCL_PSIN + +/** \internal \returns the cosine of \a a (coeff-wise) */ +#define SYCL_PCOS(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos( \ + const packet_type& a) { \ + return cl::sycl::cos(a); \ + } + +SYCL_PCOS(cl::sycl::cl_float4) +SYCL_PCOS(cl::sycl::cl_double2) +#undef SYCL_PCOS + +/** \internal \returns the tangent of \a a (coeff-wise) */ +#define SYCL_PTAN(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan( \ + const packet_type& a) { \ + return cl::sycl::tan(a); \ + } + +SYCL_PTAN(cl::sycl::cl_float4) +SYCL_PTAN(cl::sycl::cl_double2) +#undef SYCL_PTAN + +/** \internal \returns the arc sine of \a a (coeff-wise) */ +#define SYCL_PASIN(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin( \ + const packet_type& a) { \ + return cl::sycl::asin(a); \ + } + +SYCL_PASIN(cl::sycl::cl_float4) +SYCL_PASIN(cl::sycl::cl_double2) +#undef SYCL_PASIN + +/** \internal \returns the arc cosine of \a a (coeff-wise) */ +#define SYCL_PACOS(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos( \ + const packet_type& a) { \ + return cl::sycl::acos(a); \ + } + +SYCL_PACOS(cl::sycl::cl_float4) +SYCL_PACOS(cl::sycl::cl_double2) +#undef SYCL_PACOS + +/** \internal \returns the arc tangent of \a a (coeff-wise) */ +#define SYCL_PATAN(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan( \ + const packet_type& a) { \ + return cl::sycl::atan(a); \ + } + +SYCL_PATAN(cl::sycl::cl_float4) +SYCL_PATAN(cl::sycl::cl_double2) +#undef SYCL_PATAN + +/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ +#define SYCL_PSINH(packet_type) 
\ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh( \ + const packet_type& a) { \ + return cl::sycl::sinh(a); \ + } + +SYCL_PSINH(cl::sycl::cl_float4) +SYCL_PSINH(cl::sycl::cl_double2) +#undef SYCL_PSINH + +/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ +#define SYCL_PCOSH(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh( \ + const packet_type& a) { \ + return cl::sycl::cosh(a); \ + } + +SYCL_PCOSH(cl::sycl::cl_float4) +SYCL_PCOSH(cl::sycl::cl_double2) +#undef SYCL_PCOSH + +/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ +#define SYCL_PTANH(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh( \ + const packet_type& a) { \ + return cl::sycl::tanh(a); \ + } + +SYCL_PTANH(cl::sycl::cl_float4) +SYCL_PTANH(cl::sycl::cl_double2) +#undef SYCL_PTANH + +#define SYCL_PCEIL(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil( \ + const packet_type& a) { \ + return cl::sycl::ceil(a); \ + } + +SYCL_PCEIL(cl::sycl::cl_float4) +SYCL_PCEIL(cl::sycl::cl_double2) +#undef SYCL_PCEIL + +#define SYCL_PROUND(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround( \ + const packet_type& a) { \ + return cl::sycl::round(a); \ + } + +SYCL_PROUND(cl::sycl::cl_float4) +SYCL_PROUND(cl::sycl::cl_double2) +#undef SYCL_PROUND + +#define SYCL_PRINT(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print( \ + const packet_type& a) { \ + return cl::sycl::rint(a); \ + } + +SYCL_PRINT(cl::sycl::cl_float4) +SYCL_PRINT(cl::sycl::cl_double2) +#undef SYCL_PRINT + +#define SYCL_FLOOR(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor( \ + const packet_type& a) { \ + return cl::sycl::floor(a); \ + } + +SYCL_FLOOR(cl::sycl::cl_float4) +SYCL_FLOOR(cl::sycl::cl_double2) +#undef SYCL_FLOOR + +#define SYCL_PMIN(packet_type, expr) \ 
+ template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin( \ + const packet_type& a, const packet_type& b) { \ + return expr; \ + } + +SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b)) +SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b)) +#undef SYCL_PMIN + +#define SYCL_PMAX(packet_type, expr) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax( \ + const packet_type& a, const packet_type& b) { \ + return expr; \ + } + +SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b)) +SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b)) +#undef SYCL_PMAX + +#define SYCL_PLDEXP(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp( \ + const packet_type& a, const packet_type& exponent) { \ + return cl::sycl::ldexp( \ + a, exponent.template convert()); \ + } + +SYCL_PLDEXP(cl::sycl::cl_float4) +SYCL_PLDEXP(cl::sycl::cl_double2) +#undef SYCL_PLDEXP + +#endif +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_SYCL_H diff --git a/inst/include/Eigen/src/Core/arch/SYCL/PacketMath.h b/inst/include/Eigen/src/Core/arch/SYCL/PacketMath.h new file mode 100644 index 00000000..87badc07 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/SYCL/PacketMath.h @@ -0,0 +1,670 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/***************************************************************** + * PacketMath.h + * + * \brief: + * PacketMath + * + *****************************************************************/ + +#ifndef EIGEN_PACKET_MATH_SYCL_H +#define EIGEN_PACKET_MATH_SYCL_H +#include +namespace Eigen { + +namespace internal { +#ifdef SYCL_DEVICE_ONLY + +#define SYCL_PLOADT_RO(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from) { \ + typedef typename unpacket_traits::type scalar; \ + typedef cl::sycl::multi_ptr< \ + scalar, cl::sycl::access::address_space::address_space_target> \ + multi_ptr; \ + auto res = packet_type( \ + static_cast::type>(0)); \ + res.load(0, multi_ptr(const_cast(from))); \ + return res; \ + } + +SYCL_PLOADT_RO(global_space) +SYCL_PLOADT_RO(local_space) +#undef SYCL_PLOADT_RO +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type +ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, T>& from) { + return ploadt_ro(from.get_pointer()); +} + +#ifdef SYCL_DEVICE_ONLY +#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from) { \ + return ploadt_ro(from); \ + } + +// global space +SYCL_PLOAD(global_space, Unaligned, u) +SYCL_PLOAD(global_space, Aligned, ) +// local space +SYCL_PLOAD(local_space, Unaligned, u) +SYCL_PLOAD(local_space, Aligned, ) + +#undef SYCL_PLOAD +#endif + +#define SYCL_PLOAD(Alignment, AlignedType) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ + const Eigen::TensorSycl::internal::RangeAccess< \ + 
cl::sycl::access::mode::read_write, \ + typename unpacket_traits::type> \ + from) { \ + return ploadt_ro(from); \ + } +SYCL_PLOAD(Unaligned, u) +SYCL_PLOAD(Aligned, ) +#undef SYCL_PLOAD + +#ifdef SYCL_DEVICE_ONLY +/** \internal \returns a packet version of \a *from. + * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ +#define SYCL_PLOADT(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from) { \ + if (Alignment >= unpacket_traits::alignment) \ + return pload(from); \ + else \ + return ploadu(from); \ + } + +// global space +SYCL_PLOADT(global_space) +// local space +SYCL_PLOADT(local_space) +#undef SYCL_PLOADT +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type +ploadt(const Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, + typename unpacket_traits::type>& from) { + return ploadt(from.get_pointer()); +} +#ifdef SYCL_DEVICE_ONLY + +// private_space +#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \ + ploadt_ro( \ + const typename unpacket_traits::type* from) { \ + typedef typename unpacket_traits::type scalar; \ + auto res = packet_type(static_cast(0)); \ + res.template load( \ + 0, const_cast(from)); \ + return res; \ + } + +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned) + +#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \ + const typename unpacket_traits::type* from) { \ + typedef typename unpacket_traits::type scalar; \ + auto res = 
packet_type(static_cast(0)); \ + res.template load( \ + 0, const_cast(from)); \ + return res; \ + } +SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, ) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, ) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u) + +#undef SYCL_PLOAD_SPECIAL + +#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ + typename cl::sycl::multi_ptr< \ + scalar, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + to, \ + const packet_type& from) { \ + typedef cl::sycl::multi_ptr< \ + scalar, cl::sycl::access::address_space::address_space_target> \ + multi_ptr; \ + from.store(0, multi_ptr(to)); \ + } + +// global space +SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u) +SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u) + +SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u) +#undef SYCL_PSTORE + +#define SYCL_PSTORE_T(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \ + typename cl::sycl::multi_ptr< \ + scalar, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + to, \ + const packet_type& from) { \ + if (Alignment) \ + pstore(to, from); \ + else \ + pstoreu(to, from); \ + } + +SYCL_PSTORE_T(global_space) + +SYCL_PSTORE_T(local_space) + +#undef SYCL_PSTORE_T + +#define SYCL_PSET1(packet_type) \ + template 
<> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1( \ + const typename unpacket_traits::type& from) { \ + return packet_type(from); \ + } + +// global space +SYCL_PSET1(cl::sycl::cl_float4) +SYCL_PSET1(cl::sycl::cl_double2) + +#undef SYCL_PSET1 + +template +struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type + get_ploaddup(sycl_multi_pointer) {} + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type + get_pgather(sycl_multi_pointer, Index) {} +}; + +template <> +struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup( + sycl_multi_pointer from) { + return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather( + sycl_multi_pointer from, Index stride) { + return cl::sycl::cl_float4(from[0 * stride], from[1 * stride], + from[2 * stride], from[3 * stride]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter( + sycl_multi_pointer to, const cl::sycl::cl_float4& from, Index stride) { + auto tmp = stride; + to[0] = from.x(); + to[tmp] = from.y(); + to[tmp += stride] = from.z(); + to[tmp += stride] = from.w(); + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset( + const float& a) { + return cl::sycl::cl_float4(static_cast(a), static_cast(a + 1), + static_cast(a + 2), + static_cast(a + 3)); + } +}; + +template <> +struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 + get_ploaddup(const sycl_multi_pointer from) { + return cl::sycl::cl_double2(from[0], from[0]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather( + const sycl_multi_pointer from, Index stride) { + return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
set_pscatter( + sycl_multi_pointer to, const cl::sycl::cl_double2& from, Index stride) { + to[0] = from.x(); + to[stride] = from.y(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset( + const double& a) { + return cl::sycl::cl_double2(static_cast(a), + static_cast(a + 1)); + } +}; + +#define SYCL_PLOAD_DUP(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from) { \ + return get_base_packet::get_ploaddup(from); \ + } + +// global space +SYCL_PLOAD_DUP(global_space) +// local_space +SYCL_PLOAD_DUP(local_space) +#undef SYCL_PLOAD_DUP + +#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ + const typename unpacket_traits::type* from) { \ + return get_base_packet::get_ploaddup(from); \ + } + +SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4) +SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2) + +#undef SYCL_PLOAD_DUP_SPECILIZE + +#define SYCL_PLSET(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset( \ + const typename unpacket_traits::type& a) { \ + return get_base_packet::set_plset(a); \ + } + +SYCL_PLSET(cl::sycl::cl_float4) +SYCL_PLSET(cl::sycl::cl_double2) + +#undef SYCL_PLSET + +#define SYCL_PGATHER(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC inline packet_type pgather( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from, \ + Index stride) { \ + return get_base_packet::get_pgather(from, stride); \ + } + +// global space +SYCL_PGATHER(global_space) +// local space +SYCL_PGATHER(local_space) + +#undef SYCL_PGATHER + +#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE packet_type \ + pgather( \ + const typename unpacket_traits::type* from, Index stride) { \ + return get_base_packet::get_pgather(from, stride); \ + } + +SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4) +SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2) + +#undef SYCL_PGATHER_SPECILIZE + +#define SYCL_PSCATTER(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC inline void pscatter( \ + typename cl::sycl::multi_ptr< \ + typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + to, \ + const packet_type& from, Index stride) { \ + get_base_packet::set_pscatter(to, from, stride); \ + } + +// global space +SYCL_PSCATTER(global_space) +// local space +SYCL_PSCATTER(local_space) + +#undef SYCL_PSCATTER + +#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( \ + typename unpacket_traits::type * to, \ + const packet_type& from, Index stride) { \ + get_base_packet::set_pscatter(to, from, stride); \ + } + +SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4) +SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2) + +#undef SYCL_PSCATTER_SPECILIZE + +#define SYCL_PMAD(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( \ + const packet_type& a, const packet_type& b, const packet_type& c) { \ + return cl::sycl::mad(a, b, c); \ + } + +SYCL_PMAD(cl::sycl::cl_float4) +SYCL_PMAD(cl::sycl::cl_double2) +#undef SYCL_PMAD + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst( + const cl::sycl::cl_float4& a) { + return a.x(); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst( + const cl::sycl::cl_double2& a) { + return a.x(); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux( + const cl::sycl::cl_float4& a) { + return a.x() + a.y() + a.z() + a.w(); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux( + const cl::sycl::cl_double2& a) 
{ + return a.x() + a.y(); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max( + const cl::sycl::cl_float4& a) { + return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), + cl::sycl::fmax(a.z(), a.w())); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max( + const cl::sycl::cl_double2& a) { + return cl::sycl::fmax(a.x(), a.y()); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min( + const cl::sycl::cl_float4& a) { + return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), + cl::sycl::fmin(a.z(), a.w())); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min( + const cl::sycl::cl_double2& a) { + return cl::sycl::fmin(a.x(), a.y()); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul( + const cl::sycl::cl_float4& a) { + return a.x() * a.y() * a.z() * a.w(); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul( + const cl::sycl::cl_double2& a) { + return a.x() * a.y(); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 +pabs(const cl::sycl::cl_float4& a) { + return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), + cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w())); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 +pabs(const cl::sycl::cl_double2& a) { + return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y())); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a, + const Packet &b) { + return ((a <= b) + .template convert::type, + cl::sycl::rounding_mode::automatic>()); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a, + const Packet &b) { + return ((a < b) + .template convert::type, + cl::sycl::rounding_mode::automatic>()); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a, + const Packet &b) { + return ((a == b) + .template convert::type, + 
cl::sycl::rounding_mode::automatic>()); +} + +#define SYCL_PCMP(OP, TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP(const TYPE &a, \ + const TYPE &b) { \ + return sycl_pcmp_##OP(a, b); \ + } + +SYCL_PCMP(le, cl::sycl::cl_float4) +SYCL_PCMP(lt, cl::sycl::cl_float4) +SYCL_PCMP(eq, cl::sycl::cl_float4) +SYCL_PCMP(le, cl::sycl::cl_double2) +SYCL_PCMP(lt, cl::sycl::cl_double2) +SYCL_PCMP(eq, cl::sycl::cl_double2) +#undef SYCL_PCMP + +template struct convert_to_integer; + +template <> struct convert_to_integer { + using type = std::int32_t; + using packet_type = cl::sycl::cl_int4; +}; +template <> struct convert_to_integer { + using type = std::int64_t; + using packet_type = cl::sycl::cl_long2; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename convert_to_integer< + typename unpacket_traits::type>::packet_type +vector_as_int(const PacketIn &p) { + return ( + p.template convert::type>::type, + cl::sycl::rounding_mode::automatic>()); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packetOut +convert_vector(const PacketIn &p) { + return (p.template convert::type, + cl::sycl::rounding_mode::automatic>()); +} + +#define SYCL_PAND(TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pand(const TYPE &a, \ + const TYPE &b) { \ + return convert_vector(vector_as_int(a) & vector_as_int(b)); \ + } +SYCL_PAND(cl::sycl::cl_float4) +SYCL_PAND(cl::sycl::cl_double2) +#undef SYCL_PAND + +#define SYCL_POR(TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE por(const TYPE &a, \ + const TYPE &b) { \ + return convert_vector(vector_as_int(a) | vector_as_int(b)); \ + } + +SYCL_POR(cl::sycl::cl_float4) +SYCL_POR(cl::sycl::cl_double2) +#undef SYCL_POR + +#define SYCL_PXOR(TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pxor(const TYPE &a, \ + const TYPE &b) { \ + return convert_vector(vector_as_int(a) ^ vector_as_int(b)); \ + } + +SYCL_PXOR(cl::sycl::cl_float4) +SYCL_PXOR(cl::sycl::cl_double2) 
+#undef SYCL_PXOR + +#define SYCL_PANDNOT(TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pandnot(const TYPE &a, \ + const TYPE &b) { \ + return convert_vector(vector_as_int(a) & (~vector_as_int(b))); \ + } +SYCL_PANDNOT(cl::sycl::cl_float4) +SYCL_PANDNOT(cl::sycl::cl_double2) +#undef SYCL_PANDNOT + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose( + PacketBlock& kernel) { + float tmp = kernel.packet[0].y(); + kernel.packet[0].y() = kernel.packet[1].x(); + kernel.packet[1].x() = tmp; + + tmp = kernel.packet[0].z(); + kernel.packet[0].z() = kernel.packet[2].x(); + kernel.packet[2].x() = tmp; + + tmp = kernel.packet[0].w(); + kernel.packet[0].w() = kernel.packet[3].x(); + kernel.packet[3].x() = tmp; + + tmp = kernel.packet[1].z(); + kernel.packet[1].z() = kernel.packet[2].y(); + kernel.packet[2].y() = tmp; + + tmp = kernel.packet[1].w(); + kernel.packet[1].w() = kernel.packet[3].y(); + kernel.packet[3].y() = tmp; + + tmp = kernel.packet[2].w(); + kernel.packet[2].w() = kernel.packet[3].z(); + kernel.packet[3].z() = tmp; +} + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose( + PacketBlock& kernel) { + double tmp = kernel.packet[0].y(); + kernel.packet[0].y() = kernel.packet[1].x(); + kernel.packet[1].x() = tmp; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend( + const Selector::size>& ifPacket, + const cl::sycl::cl_float4& thenPacket, + const cl::sycl::cl_float4& elsePacket) { + cl::sycl::cl_int4 condition( + ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, + ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1); + return cl::sycl::select(thenPacket, elsePacket, condition); +} + +template <> +inline cl::sycl::cl_double2 pblend( + const Selector::size>& ifPacket, + const cl::sycl::cl_double2& thenPacket, + const cl::sycl::cl_double2& elsePacket) { + cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, + ifPacket.select[1] ? 
0 : -1); + return cl::sycl::select(thenPacket, elsePacket, condition); +} +#endif // SYCL_DEVICE_ONLY + +#define SYCL_PSTORE(alignment) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ + const Eigen::TensorSycl::internal::RangeAccess< \ + cl::sycl::access::mode::read_write, \ + typename unpacket_traits::type>& to, \ + const packet_type& from) { \ + pstore##alignment(to.get_pointer(), from); \ + } + +// global space +SYCL_PSTORE() +SYCL_PSTORE(u) + +#undef SYCL_PSTORE + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( + Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, + typename unpacket_traits::type> + to, + const packet_type& from) { + pstoret(to.get_pointer(), from); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_SYCL_H diff --git a/inst/include/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/inst/include/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h new file mode 100644 index 00000000..f81e59db --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h @@ -0,0 +1,694 @@ +/*************************************************************************** + * Copyright (C) 2017 Codeplay Software Limited + * This Source Code Form is subject to the terms of the Mozilla + * Public License v. 2.0. If a copy of the MPL was not distributed + * with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * + * SyclMemoryModel.h + * + * Description: + * Interface for SYCL buffers to behave as a non-dereferenceable pointer + * Interface for Placeholder accessor to behave as a pointer on both host + * and device + * + * Authors: + * + * Ruyman Reyes Codeplay Software Ltd. + * Mehdi Goli Codeplay Software Ltd. + * Vanya Yaneva Codeplay Software Ltd. 
+ * + **************************************************************************/ + +#if defined(EIGEN_USE_SYCL) && \ + !defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H) +#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H + +#include +#ifdef EIGEN_EXCEPTIONS +#include +#endif +#include +#include +#include +#include + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +using sycl_acc_target = cl::sycl::access::target; +using sycl_acc_mode = cl::sycl::access::mode; + +/** + * Default values for template arguments + */ +using buffer_data_type_t = uint8_t; +const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer; +const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write; + +/** + * PointerMapper + * Associates fake pointers with buffers. + * + */ +class PointerMapper { + public: + using base_ptr_t = std::intptr_t; + + /* Structure of a virtual pointer + * + * |================================================| + * | POINTER ADDRESS | + * |================================================| + */ + struct virtual_pointer_t { + /* Type for the pointers + */ + base_ptr_t m_contents; + + /** Conversions from virtual_pointer_t to + * void * should just reinterpret_cast the integer number + */ + operator void *() const { return reinterpret_cast(m_contents); } + + /** + * Convert back to the integer number. + */ + operator base_ptr_t() const { return m_contents; } + + /** + * Add a certain value to the pointer to create a + * new pointer to that offset + */ + virtual_pointer_t operator+(size_t off) { return m_contents + off; } + + /* Numerical order for sorting pointers in containers. 
*/ + bool operator<(virtual_pointer_t rhs) const { + return (static_cast(m_contents) < + static_cast(rhs.m_contents)); + } + + bool operator>(virtual_pointer_t rhs) const { + return (static_cast(m_contents) > + static_cast(rhs.m_contents)); + } + + /** + * Numerical order for sorting pointers in containers + */ + bool operator==(virtual_pointer_t rhs) const { + return (static_cast(m_contents) == + static_cast(rhs.m_contents)); + } + + /** + * Simple forward to the equality overload. + */ + bool operator!=(virtual_pointer_t rhs) const { + return !(this->operator==(rhs)); + } + + /** + * Converts a void * into a virtual pointer structure. + * Note that this will only work if the void * was + * already a virtual_pointer_t, but we have no way of + * checking + */ + virtual_pointer_t(const void *ptr) + : m_contents(reinterpret_cast(ptr)){}; + + /** + * Creates a virtual_pointer_t from the given integer + * number + */ + virtual_pointer_t(base_ptr_t u) : m_contents(u){}; + }; + + /* Definition of a null pointer + */ + const virtual_pointer_t null_virtual_ptr = nullptr; + + /** + * Whether if a pointer is null or not. + * A pointer is nullptr if the value is of null_virtual_ptr + */ + static inline bool is_nullptr(virtual_pointer_t ptr) { + return (static_cast(ptr) == nullptr); + } + + /* basic type for all buffers + */ + using buffer_t = cl::sycl::buffer_mem; + + /** + * Node that stores information about a device allocation. + * Nodes are sorted by size to organise a free list of nodes + * that can be recovered. 
+ */ + struct pMapNode_t { + buffer_t m_buffer; + size_t m_size; + bool m_free; + + pMapNode_t(buffer_t b, size_t size, bool f) + : m_buffer{b}, m_size{size}, m_free{f} { + m_buffer.set_final_data(nullptr); + } + + bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); } + }; + + /** Storage of the pointer / buffer tree + */ + using pointerMap_t = std::map; + + /** + * Obtain the insertion point in the pointer map for + * a pointer of the given size. + * \param requiredSize Size attemted to reclaim + */ + typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) { + typename pointerMap_t::iterator retVal; + bool reuse = false; + if (!m_freeList.empty()) { + // try to re-use an existing block + for (auto freeElem : m_freeList) { + if (freeElem->second.m_size >= requiredSize) { + retVal = freeElem; + reuse = true; + // Element is not going to be free anymore + m_freeList.erase(freeElem); + break; + } + } + } + if (!reuse) { + retVal = std::prev(m_pointerMap.end()); + } + return retVal; + } + + /** + * Returns an iterator to the node that stores the information + * of the given virtual pointer from the given pointer map structure. + * If pointer is not found, throws std::out_of_range. 
+ * If the pointer map structure is empty, throws std::out_of_range + * + * \param pMap the pointerMap_t structure storing all the pointers + * \param virtual_pointer_ptr The virtual pointer to obtain the node of + * \throws std::out:of_range if the pointer is not found or pMap is empty + */ + typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) { + if (this->count() == 0) { + m_pointerMap.clear(); + EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n")); + + } + if (is_nullptr(ptr)) { + m_pointerMap.clear(); + EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n")); + } + // The previous element to the lower bound is the node that + // holds this memory address + auto node = m_pointerMap.lower_bound(ptr); + // If the value of the pointer is not the one of the node + // then we return the previous one + if (node == std::end(m_pointerMap)) { + --node; + } else if (node->first != ptr) { + if (node == std::begin(m_pointerMap)) { + m_pointerMap.clear(); + EIGEN_THROW_X( + std::out_of_range("The pointer is not registered in the map\n")); + + } + --node; + } + + return node; + } + + /* get_buffer. + * Returns a buffer from the map using the pointer address + */ + template + cl::sycl::buffer get_buffer( + const virtual_pointer_t ptr) { + using sycl_buffer_t = cl::sycl::buffer; + + // get_node() returns a `buffer_mem`, so we need to cast it to a `buffer<>`. + // We can do this without the `buffer_mem` being a pointer, as we + // only declare member variables in the base class (`buffer_mem`) and not in + // the child class (`buffer<>). 
+ auto node = get_node(ptr); + eigen_assert(node->first == ptr || node->first < ptr); + eigen_assert(ptr < static_cast(node->second.m_size + + node->first)); + return *(static_cast(&node->second.m_buffer)); + } + + /** + * @brief Returns an accessor to the buffer of the given virtual pointer + * @param accessMode + * @param accessTarget + * @param ptr The virtual pointer + */ + template + cl::sycl::accessor + get_access(const virtual_pointer_t ptr) { + auto buf = get_buffer(ptr); + return buf.template get_access(); + } + + /** + * @brief Returns an accessor to the buffer of the given virtual pointer + * in the given command group scope + * @param accessMode + * @param accessTarget + * @param ptr The virtual pointer + * @param cgh Reference to the command group scope + */ + template + cl::sycl::accessor + get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) { + auto buf = get_buffer(ptr); + return buf.template get_access(cgh); + } + + /* + * Returns the offset from the base address of this pointer. + */ + inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) { + // The previous element to the lower bound is the node that + // holds this memory address + auto node = get_node(ptr); + auto start = node->first; + eigen_assert(start == ptr || start < ptr); + eigen_assert(ptr < start + node->second.m_size); + return (ptr - start); + } + + /* + * Returns the number of elements by which the given pointer is offset from + * the base address. + */ + template + inline size_t get_element_offset(const virtual_pointer_t ptr) { + return get_offset(ptr) / sizeof(buffer_data_type); + } + + /** + * Constructs the PointerMapper structure. 
+ */ + PointerMapper(base_ptr_t baseAddress = 4096) + : m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} { + if (m_baseAddress == 0) { + EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n")); + } + }; + + /** + * PointerMapper cannot be copied or moved + */ + PointerMapper(const PointerMapper &) = delete; + + /** + * Empty the pointer list + */ + inline void clear() { + m_freeList.clear(); + m_pointerMap.clear(); + } + + /* add_pointer. + * Adds an existing pointer to the map and returns the virtual pointer id. + */ + inline virtual_pointer_t add_pointer(const buffer_t &b) { + return add_pointer_impl(b); + } + + /* add_pointer. + * Adds a pointer to the map and returns the virtual pointer id. + */ + inline virtual_pointer_t add_pointer(buffer_t &&b) { + return add_pointer_impl(b); + } + + /** + * @brief Fuses the given node with the previous nodes in the + * pointer map if they are free + * + * @param node A reference to the free node to be fused + */ + void fuse_forward(typename pointerMap_t::iterator &node) { + while (node != std::prev(m_pointerMap.end())) { + // if following node is free + // remove it and extend the current node with its size + auto fwd_node = std::next(node); + if (!fwd_node->second.m_free) { + break; + } + auto fwd_size = fwd_node->second.m_size; + m_freeList.erase(fwd_node); + m_pointerMap.erase(fwd_node); + + node->second.m_size += fwd_size; + } + } + + /** + * @brief Fuses the given node with the following nodes in the + * pointer map if they are free + * + * @param node A reference to the free node to be fused + */ + void fuse_backward(typename pointerMap_t::iterator &node) { + while (node != m_pointerMap.begin()) { + // if previous node is free, extend it + // with the size of the current one + auto prev_node = std::prev(node); + if (!prev_node->second.m_free) { + break; + } + prev_node->second.m_size += node->second.m_size; + + // remove the current node + m_freeList.erase(node); + m_pointerMap.erase(node); + + 
// point to the previous node + node = prev_node; + } + } + + /* remove_pointer. + * Removes the given pointer from the map. + * The pointer is allowed to be reused only if ReUse if true. + */ + template + void remove_pointer(const virtual_pointer_t ptr) { + if (is_nullptr(ptr)) { + return; + } + auto node = this->get_node(ptr); + + node->second.m_free = true; + m_freeList.emplace(node); + + // Fuse the node + // with free nodes before and after it + fuse_forward(node); + fuse_backward(node); + + // If after fusing the node is the last one + // simply remove it (since it is free) + if (node == std::prev(m_pointerMap.end())) { + m_freeList.erase(node); + m_pointerMap.erase(node); + } + } + + /* count. + * Return the number of active pointers (i.e, pointers that + * have been malloc but not freed). + */ + size_t count() const { return (m_pointerMap.size() - m_freeList.size()); } + + private: + /* add_pointer_impl. + * Adds a pointer to the map and returns the virtual pointer id. + * BufferT is either a const buffer_t& or a buffer_t&&. 
+ */ + template + virtual_pointer_t add_pointer_impl(BufferT b) { + virtual_pointer_t retVal = nullptr; + size_t bufSize = b.get_count(); + pMapNode_t p{b, bufSize, false}; + // If this is the first pointer: + if (m_pointerMap.empty()) { + virtual_pointer_t initialVal{m_baseAddress}; + m_pointerMap.emplace(initialVal, p); + return initialVal; + } + + auto lastElemIter = get_insertion_point(bufSize); + // We are recovering an existing free node + if (lastElemIter->second.m_free) { + lastElemIter->second.m_buffer = b; + lastElemIter->second.m_free = false; + + // If the recovered node is bigger than the inserted one + // add a new free node with the remaining space + if (lastElemIter->second.m_size > bufSize) { + // create a new node with the remaining space + auto remainingSize = lastElemIter->second.m_size - bufSize; + pMapNode_t p2{b, remainingSize, true}; + + // update size of the current node + lastElemIter->second.m_size = bufSize; + + // add the new free node + auto newFreePtr = lastElemIter->first + bufSize; + auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first; + m_freeList.emplace(freeNode); + } + + retVal = lastElemIter->first; + } else { + size_t lastSize = lastElemIter->second.m_size; + retVal = lastElemIter->first + lastSize; + m_pointerMap.emplace(retVal, p); + } + return retVal; + } + + /** + * Compare two iterators to pointer map entries according to + * the size of the allocation on the device. + */ + struct SortBySize { + bool operator()(typename pointerMap_t::iterator a, + typename pointerMap_t::iterator b) const { + return ((a->first < b->first) && (a->second <= b->second)) || + ((a->first < b->first) && (b->second <= a->second)); + } + }; + + /* Maps the pointer addresses to buffer and size pairs. + */ + pointerMap_t m_pointerMap; + + /* List of free nodes available for re-using + */ + std::set m_freeList; + + /* Base address used when issuing the first virtual pointer, allows users + * to specify alignment. Cannot be zero. 
*/ + std::intptr_t m_baseAddress; +}; + +/* remove_pointer. + * Removes the given pointer from the map. + * The pointer is allowed to be reused only if ReUse if true. + */ +template <> +inline void PointerMapper::remove_pointer(const virtual_pointer_t ptr) { + if (is_nullptr(ptr)) { + return; + } + m_pointerMap.erase(this->get_node(ptr)); +} + +/** + * Malloc-like interface to the pointer-mapper. + * Given a size, creates a byte-typed buffer and returns a + * fake pointer to keep track of it. + * \param size Size in bytes of the desired allocation + * \throw cl::sycl::exception if error while creating the buffer + */ +inline void *SYCLmalloc(size_t size, PointerMapper &pMap) { + if (size == 0) { + return nullptr; + } + // Create a generic buffer of the given size + using buffer_t = cl::sycl::buffer; + auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size})); + // Store the buffer on the global list + return static_cast(thePointer); +} + +/** + * Free-like interface to the pointer mapper. + * Given a fake-pointer created with the virtual-pointer malloc, + * destroys the buffer and remove it from the list. + * If ReUse is false, the pointer is not added to the freeList, + * it should be false only for sub-buffers. + */ +template +inline void SYCLfree(void *ptr, PointerMapper &pMap) { + pMap.template remove_pointer(ptr); +} + +/** + * Clear all the memory allocated by SYCL. 
+ */ +template +inline void SYCLfreeAll(PointerMapper &pMap) { + pMap.clear(); +} + +template +struct RangeAccess { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef T scalar_t; + typedef scalar_t &ref_t; + typedef typename cl::sycl::global_ptr::pointer_t ptr_t; + + // the accessor type does not necessarily the same as T + typedef cl::sycl::accessor + accessor; + + typedef RangeAccess self_t; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access, + size_t offset, + std::intptr_t virtual_ptr) + : access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {} + + RangeAccess(cl::sycl::buffer buff = + cl::sycl::buffer(cl::sycl::range<1>(1))) + : access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {} + + // This should be only used for null constructor on the host side + RangeAccess(std::nullptr_t) : RangeAccess() {} + // This template parameter must be removed and scalar_t should be replaced + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const { + return (access_.get_pointer().get() + offset_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) { + offset_ += (offset); + return *this; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const { + return self_t(access_, offset_ + offset, virtual_ptr_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const { + return self_t(access_, offset_ - offset, virtual_ptr_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) { + offset_ -= offset; + return *this; + } + + // THIS IS FOR NULL COMPARISON ONLY + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==( + const RangeAccess &lhs, std::nullptr_t) { + return ((lhs.virtual_ptr_ == -1)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=( + const RangeAccess &lhs, 
std::nullptr_t i) { + return !(lhs == i); + } + + // THIS IS FOR NULL COMPARISON ONLY + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==( + std::nullptr_t, const RangeAccess &rhs) { + return ((rhs.virtual_ptr_ == -1)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=( + std::nullptr_t i, const RangeAccess &rhs) { + return !(i == rhs); + } + // Prefix operator (Increment and return value) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() { + offset_++; + return (*this); + } + + // Postfix operator (Return value and increment) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) { + EIGEN_UNUSED_VARIABLE(i); + self_t temp_iterator(*this); + offset_++; + return temp_iterator; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const { + return (access_.get_count() - offset_); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const { + return offset_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) { + offset_ = offset; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const { + return *get_pointer(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() { + return *get_pointer(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) { + return *(get_pointer() + x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const { + return *(get_pointer() + x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const { + return reinterpret_cast(virtual_ptr_ + + (offset_ * sizeof(scalar_t))); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const { + return (virtual_ptr_ != -1); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess() { + return RangeAccess(access_, offset_, virtual_ptr_); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + operator 
RangeAccess() const { + return RangeAccess(access_, offset_, virtual_ptr_); + } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( + cl::sycl::handler &cgh) const { + cgh.require(access_); + } + + private: + accessor access_; + size_t offset_; + std::intptr_t virtual_ptr_; // the location of the buffer in the map +}; + +template +struct RangeAccess : RangeAccess { + typedef RangeAccess Base; + using Base::Base; +}; + +} // namespace internal +} // namespace TensorSycl +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H diff --git a/inst/include/Eigen/src/Core/arch/SYCL/TypeCasting.h b/inst/include/Eigen/src/Core/arch/SYCL/TypeCasting.h new file mode 100644 index 00000000..9208ab21 --- /dev/null +++ b/inst/include/Eigen/src/Core/arch/SYCL/TypeCasting.h @@ -0,0 +1,85 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/***************************************************************** + * TypeCasting.h + * + * \brief: + * TypeCasting + * + *****************************************************************/ + +#ifndef EIGEN_TYPE_CASTING_SYCL_H +#define EIGEN_TYPE_CASTING_SYCL_H + +namespace Eigen { + +namespace internal { +#ifdef SYCL_DEVICE_ONLY +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 +pcast(const cl::sycl::cl_float4& a) { + return a + .template convert(); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 +pcast(const cl::sycl::cl_int4& a) { + return a.template convert(); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 +pcast( + const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) { + auto a1 = a.template convert(); + auto b1 = b.template convert(); + return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y()); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 +pcast(const cl::sycl::cl_float4& a) { + // Simply discard the second half of the input + return cl::sycl::cl_double2(a.x(), a.y()); +} + +#endif +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_SYCL_H diff --git a/inst/include/Eigen/src/Core/arch/ZVector/Complex.h b/inst/include/Eigen/src/Core/arch/ZVector/Complex.h index 1bfb7339..0b9b33d9 100644 --- a/inst/include/Eigen/src/Core/arch/ZVector/Complex.h +++ b/inst/include/Eigen/src/Core/arch/ZVector/Complex.h @@ -15,6 +15,10 @@ namespace Eigen { 
namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); +#endif + static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -29,10 +33,14 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) union { Packet4f v; Packet1cd cd[2]; }; +#else + Packet4f v; +#endif }; template<> struct packet_traits > : default_packet_traits @@ -83,69 +91,33 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); -template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +/* complex first */ template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return 
Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) -{ - Packet2cf res; - res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); - res.cd[1] = res.cd[0]; - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); -} template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) { return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - pstore >((std::complex *) af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; -} template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, 
Index stride EIGEN_UNUSED) { pstore >(to, from); } - -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) -{ - Packet2cf res; - res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; - res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; - return res; -} - template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d a_re, a_im, v1, v2; @@ -163,27 +135,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(v1 + v2); } -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet2cf res; - res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; - res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; - return res; -} - -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } 
-template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } - +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { + Packet2d eq = vec_cmpeq (a.v, b.v); + Packet2d tmp = { eq[1], eq[0] }; + return (Packet1cd)pand(eq, tmp); +} -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) 
@@ -193,160 +155,157 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac return res; } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) -{ - std::complex EIGEN_ALIGN16 res[2]; - pstore >(res, a); - - return res[0]; -} template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { - Packet2cf res; - res.cd[0] = a.cd[1]; - res.cd[1] = a.cd[0]; - return res; + return pfirst(a); } - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - std::complex res; - Packet1cd b = padd(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; + // TODO optimize it for AltiVec + Packet1cd res = pmul(a,pconj(b)); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) +EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) { - return vecs[0]; + return Packet1cd(preverse(Packet2d(x.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - ptranspose(transpose); + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} - return padd(transpose.packet[0], transpose.packet[1]); -} +/* 
complex follows */ +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - return pfirst(a); + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); + + return res[0]; } -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) + + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - std::complex res; - Packet1cd b = pmul(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; return res; } - -template -struct palign_impl +#else +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
- } -}; + Packet2cf res; + if((std::ptrdiff_t(&from) % 16) == 0) + res.v = pload((const float *)&from); + else + res.v = ploadu((const float *)&from); + res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); + return res; +} +#endif -template -struct palign_impl +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset == 1) { - first.cd[0] = first.cd[1]; - first.cd[1] = second.cd[0]; - } - } -}; - -template<> struct conj_helper + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } +template<> 
EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { + Packet4f eq = pcmp_eq (a.v, b.v); + Packet2cf res; + Packet2d tmp1 = { eq.v4f[0][1], eq.v4f[0][0] }; + Packet2d tmp2 = { eq.v4f[1][1], eq.v4f[1][0] }; + res.v.v4f[0] = pand(eq.v4f[0], tmp1); + res.v.v4f[1] = pand(eq.v4f[1], tmp2); + return res; +} -template<> struct conj_helper +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; + Packet2cf res; + 
res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; + return res; +} -template<> struct conj_helper +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; + return res; +} - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} -template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { - // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; } +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec @@ -356,11 +315,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return res; } -EIGEN_STRONG_INLINE 
Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} - EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) { Packet2cf res; @@ -369,13 +323,6 @@ EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) return res; } -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); - kernel.packet[0].v = tmp; -} - EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet1cd tmp = kernel.packet[0].cd[1]; @@ -389,6 +336,88 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); return result; } +#else +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { + Packet4f eq = vec_cmpeq (a.v, b.v); + Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] }; + return (Packet2cf)pand(eq, tmp); +} +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet4f a_re, a_im, prod, prod_im; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + + // multiply a_im * b and get the conjugate result + prod_im = a_im * b.v; + prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); + // permute back to a proper order + prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); + + // multiply a_re * b, add prod_im + prod = pmadd(a_re, b.v, prod_im); + + return Packet2cf(prod); +} + +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet4f rev_a; + rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); + return 
Packet2cf(rev_a); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + Packet4f b; + b = vec_sld(a.v, a.v, 8); + b = padd(a.v, b); + return pfirst(Packet2cf(b)); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + Packet4f b; + Packet2cf prod; + b = vec_sld(a.v, a.v, 8); + prod = pmul(a, Packet2cf(b)); + + return pfirst(prod); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for AltiVec + Packet2cf res = pmul(a, pconj(b)); + Packet4f s = pmul(b.v, b.v); + return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); + return result; +} +#endif } // end namespace internal diff --git a/inst/include/Eigen/src/Core/arch/ZVector/MathFunctions.h b/inst/include/Eigen/src/Core/arch/ZVector/MathFunctions.h index 5c7aa725..1635e128 100644 --- a/inst/include/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/inst/include/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -20,6 +20,50 @@ namespace Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static 
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); + +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + +/* the smallest non denormalized float number */ +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static 
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); +#endif + static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); @@ -93,43 +137,95 @@ Packet2d pexp(const Packet2d& _x) } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& x) +Packet4f pexp(const Packet4f& _x) { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + Packet4f x = _x; + + Packet4f tmp, fx; + Packet4i emm0; + + // clamp x + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + + // express exp(x) as exp(g + n*log(2)) + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + z = pmul(x,x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // build 2^n + emm0 = (Packet4i){ (int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3] }; + emm0 = emm0 + p4i_0x7f; + emm0 = emm0 << reinterpret_cast(p4i_23); + + return pmax(pmul(y, reinterpret_cast(emm0)), _x); +#else Packet4f res; - res.v4f[0] = pexp(x.v4f[0]); - res.v4f[1] = pexp(x.v4f[1]); + res.v4f[0] = pexp(_x.v4f[0]); + res.v4f[1] = pexp(_x.v4f[1]); return res; +#endif } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { - return __builtin_s390_vfsqdb(x); + return vec_sqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && 
__ARCH__ >= 12) + res = vec_sqrt(x); +#else res.v4f[0] = psqrt(x.v4f[0]); res.v4f[1] = psqrt(x.v4f[1]); +#endif return res; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return pset1(1.0) / psqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = pset1(1.0) / psqrt(x); +#else res.v4f[0] = prsqrt(x.v4f[0]); res.v4f[1] = prsqrt(x.v4f[1]); +#endif return res; } +// Hyperbolic Tangent function. +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& x) { + return internal::generic_fast_tanh_float(x); +} + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/arch/ZVector/PacketMath.h b/inst/include/Eigen/src/Core/arch/ZVector/PacketMath.h index 57b01fc6..1f55a90a 100755 --- a/inst/include/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/inst/include/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -10,26 +10,20 @@ #ifndef EIGEN_PACKET_MATH_ZVECTOR_H #define EIGEN_PACKET_MATH_ZVECTOR_H -#include - namespace Eigen { namespace internal { #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif typedef __vector int Packet4i; @@ -41,21 +35,30 @@ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef 
__vector long long Packet2l; +// Z14 has builtin support for float vectors +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +typedef __vector float Packet4f; +#else typedef struct { Packet2d v4f[2]; } Packet4f; +#endif typedef union { - int32_t i[4]; - uint32_t ui[4]; - int64_t l[2]; - uint64_t ul[2]; + numext::int32_t i[4]; + numext::uint32_t ui[4]; + numext::int64_t l[2]; + numext::uint64_t ul[2]; double d[2]; + float f[4]; Packet4i v4i; Packet4ui v4ui; Packet2l v2l; Packet2ul v2ul; Packet2d v2d; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + Packet4f v4f; +#endif } Packet; // We don't want to write the same code all the time, but we need to reuse the constants @@ -80,15 +83,31 @@ typedef union { Packet2l p2l_##NAME = pset1(X) // These constants are endian-agnostic -//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); -static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { numext::bit_cast0x8000000000000000ull), + numext::bit_cast0x8000000000000000ull) }; + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) + +static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static Packet4f p4f_MZERO = { 0x80000000, 
0x80000000, 0x80000000, 0x80000000}; +#endif static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; @@ -120,9 +139,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0 static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; -//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC @@ -149,29 +168,31 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4, + size = 4, HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasMin = 1, - HasMax = 1, - HasAbs = 1, - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasTanh = 1, + HasErf = 1, HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -211,9 +232,9 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; -template<> struct 
unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; /* Forward declaration */ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); @@ -258,82 +279,15 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } -/* Helper function to simulate a vec_splat_packet4f - */ -template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { - Packet4f splat; - switch (element) { - case 0: - splat.v4f[0] = vec_splat(from.v4f[0], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 1: - splat.v4f[0] = vec_splat(from.v4f[0], 1); - splat.v4f[1] = splat.v4f[0]; - break; - case 2: - splat.v4f[0] = vec_splat(from.v4f[1], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 3: - splat.v4f[0] = vec_splat(from.v4f[1], 1); - splat.v4f[1] = splat.v4f[0]; - break; - } - return splat; + Packet vt; + vt.v4f = v; + s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3]; + return s; } - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - switch (Offset % 4) { - case 1: - first = 
vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } - } -}; - -/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double - */ -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - switch (Offset % 4) { - case 1: - first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); - first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); - break; - case 2: - first.v4f[0] = first.v4f[1]; - first.v4f[1] = second.v4f[0]; - break; - case 3: - first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); - first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); - break; - } - } -}; - - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset == 1) - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); - } -}; +#endif template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { @@ -344,16 +298,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) return vfrom->v4i; } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet4f vfrom; - vfrom.v4f[0] = vec_ld2f(&from[0]); - vfrom.v4f[1] = vec_ld2f(&from[2]); - return vfrom; -} - template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { // FIXME: No intrinsic yet @@ -372,15 +316,6 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f vto->v4i = from; } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - vec_st2f(from.v4f[0], &to[0]); - vec_st2f(from.v4f[1], &to[2]); -} - - template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { // FIXME: No intrinsic yet @@ -397,13 +332,6 @@ template<> EIGEN_STRONG_INLINE 
Packet4i pset1(const int& from) template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vec_splats(from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) -{ - Packet4f to; - to.v4f[0] = pset1(static_cast(from)); - to.v4f[1] = to.v4f[0]; - return to; -} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, @@ -416,17 +344,6 @@ pbroadcast4(const int *a, a3 = vec_splat(a3, 3); } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = vec_splat_packet4f<0>(a3); - a1 = vec_splat_packet4f<1>(a3); - a2 = vec_splat_packet4f<2>(a3); - a3 = vec_splat_packet4f<3>(a3); -} - template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) @@ -449,16 +366,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f return pload(ai); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); -} - template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -477,16 +384,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - pstore((float *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; -} - template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -496,160 +393,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } 
-template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] + b.v4f[0]; - c.v4f[1] = a.v4f[1] + b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] - b.v4f[0]; - c.v4f[1] = a.v4f[1] - b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] * b.v4f[0]; - c.v4f[1] = a.v4f[1] * b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] / b.v4f[0]; - c.v4f[1] = a.v4f[1] / b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) -{ - Packet4f c; - c.v4f[0] = -a.v4f[0]; - c.v4f[1] = -a.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { 
return a; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ - Packet4f res; - res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); - res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - Packet4f 
res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); - res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); - return res; -} -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_round(a.v4f[0]); - res.v4f[1] = vec_round(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_ceil(a.v4f[0]); - res.v4f[1] = vec_ceil(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -{ - 
Packet4f res; - res.v4f[0] = vec_floor(a.v4f[0]); - res.v4f[1] = vec_floor(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } @@ -659,14 +448,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec_perm(p, p, p16uc_DUPLICATE32_HI); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - Packet4f p = pload(from); - p.v4f[1] = vec_splat(p.v4f[0], 1); - p.v4f[0] = vec_splat(p.v4f[0], 0); - return p; -} - template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p = pload(from); @@ -674,15 +455,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const 
Packet4i& a) @@ -695,23 +473,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - Packet4f rev; - rev.v4f[0] = preverse(a.v4f[1]); - rev.v4f[1] = preverse(a.v4f[0]); - return rev; -} - template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = pabs(a.v4f[0]); - res.v4f[1] = pabs(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -730,71 +493,10 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) sum = padd(a, b); return pfirst(sum); } -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - Packet2d sum; - sum = padd(a.v4f[0], a.v4f[1]); - double first = predux(sum); - return static_cast(first); -} - -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - Packet4i v[4], sum[4]; - - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); - - // Now do the summation: - // Lines 0+1 - sum[0] = padd(sum[0], sum[1]); - // Lines 2+3 - sum[1] = padd(sum[2], sum[3]); - // Add the results - sum[0] = padd(sum[0], sum[1]); - - return sum[0]; -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - Packet2d v[2], sum; 
- v[0] = padd(vecs[0], reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8))); - v[1] = padd(vecs[1], reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8))); - - sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); - - return sum; -} - -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - transpose.packet[2] = vecs[2]; - transpose.packet[3] = vecs[3]; - ptranspose(transpose); - - Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); - sum = padd(sum, transpose.packet[2]); - sum = padd(sum, transpose.packet[3]); - return sum; -} - -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { EIGEN_ALIGN16 int aux[4]; pstore(aux, a); @@ -806,12 +508,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - // Return predux_mul of the subvectors product - return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); -} - // min template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { @@ -826,14 +522,6 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet2d b, res; - b = pmin(a.v4f[0], a.v4f[1]); - res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - // max template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { @@ -849,14 +537,6 @@ 
template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet2d b, res; - b = pmax(a.v4f[0], a.v4f[1]); - res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); @@ -877,6 +557,282 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = t1; } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + + +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +/* z13 has no vector float support so we emulate that with double + z14 has proper vector float support. 
+*/ +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +/* Helper function to simulate a vec_splat_packet4f + */ +template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} + +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + vec_st2f(from.v4f[0], &to[0]); + vec_st2f(from.v4f[1], &to[2]); +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + Packet4f to; + to.v4f[0] = pset1(static_cast(from)); + to.v4f[1] = to.v4f[0]; + return to; +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat_packet4f<0>(a3); + a1 = vec_splat_packet4f<1>(a3); + a2 = vec_splat_packet4f<2>(a3); + a3 = vec_splat_packet4f<3>(a3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + pstore((float *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = 
ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] + b.v4f[0]; + c.v4f[1] = a.v4f[1] + b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] - b.v4f[0]; + c.v4f[1] = a.v4f[1] - b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] * b.v4f[0]; + c.v4f[1] = a.v4f[1] * b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] / b.v4f[0]; + c.v4f[1] = a.v4f[1] / b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + Packet4f c; + c.v4f[0] = -a.v4f[0]; + c.v4f[1] = -a.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ + Packet4f res; + res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); + res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = por(a.v4f[0], b.v4f[0]); + res.v4f[1] = por(a.v4f[1], b.v4f[1]); 
+ return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pxor(a.v4f[0], b.v4f[0]); + res.v4f[1] = pxor(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); + res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_round(a.v4f[0]); + res.v4f[1] = vec_round(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_ceil(a.v4f[0]); + res.v4f[1] = vec_ceil(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_floor(a.v4f[0]); + res.v4f[1] = vec_floor(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + p.v4f[1] = vec_splat(p.v4f[0], 1); + p.v4f[0] = vec_splat(p.v4f[0], 0); + return p; +} + +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + Packet4f rev; + rev.v4f[0] = preverse(a.v4f[1]); + rev.v4f[1] = preverse(a.v4f[0]); + return rev; +} + +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = pabs(a.v4f[0]); + res.v4f[1] = pabs(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet2d sum; + sum = padd(a.v4f[0], a.v4f[1]); + double first = predux(sum); + return static_cast(first); +} + +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + // Return predux_mul of the subvectors product + return static_cast(pfirst(predux_mul(pmul(a.v4f[0], 
a.v4f[1])))); +} + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet2d b, res; + b = pmin(a.v4f[0], a.v4f[1]); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet2d b, res; + b = pmax(a.v4f[0], a.v4f[1]); + res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one */ EIGEN_DEVICE_FUNC inline void @@ -915,12 +871,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3].v4f[1] = t3.packet[1]; } -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; @@ -932,12 +882,177 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons return result; } -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); +template<> Packet4f EIGEN_STRONG_INLINE pcmp_le(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]); + res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]); + return res; +} + +template<> Packet4f 
EIGEN_STRONG_INLINE pcmp_lt(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]); + res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]); + return res; +} + +template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]); + res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]); + return res; +} + +#else +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4f; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4f = from; +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + return vec_splats(from); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return pload(af); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + pstore((float*)af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return (a * b); } +template<> 
EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet4f pconj (const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pmadd (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } +template<> EIGEN_STRONG_INLINE Packet4f pmin (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pand (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f por (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} + +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet4f b, sum; + b = vec_sld(a, a, 8); + 
sum = padd(a, b); + b = vec_sld(sum, sum, 4); + sum = padd(sum, b); + return pfirst(sum); +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + Packet4f prod; + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); +} + +// min +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet4f b, res; + b = pmin(a, vec_sld(a, a, 8)); + res = pmin(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +// max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet4f b, res; + b = pmax(a, vec_sld(a, a, 8)); + res = pmax(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +#endif + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } + } // end namespace internal } // end 
namespace Eigen diff --git a/inst/include/Eigen/src/Core/functors/AssignmentFunctors.h b/inst/include/Eigen/src/Core/functors/AssignmentFunctors.h index 4153b877..bf64ef4e 100644 --- a/inst/include/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/inst/include/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef __CUDACC__ +#ifdef EIGEN_GPUCC // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast(b)=a; a=t; #else @@ -157,7 +157,16 @@ template struct functor_traits > { enum { Cost = 3 * NumTraits::ReadCost, - PacketAccess = packet_traits::Vectorizable + PacketAccess = + #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__)) + // This is a partial workaround for a bug in clang generating bad code + // when mixing 256/512 bits loads and 128 bits moves. 
+ // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684 + // https://bugs.llvm.org/show_bug.cgi?id=40815 + 0 + #else + packet_traits::Vectorizable + #endif }; }; diff --git a/inst/include/Eigen/src/Core/functors/BinaryFunctors.h b/inst/include/Eigen/src/Core/functors/BinaryFunctors.h index 3eae6b8c..63f09ab9 100644 --- a/inst/include/Eigen/src/Core/functors/BinaryFunctors.h +++ b/inst/include/Eigen/src/Core/functors/BinaryFunctors.h @@ -39,32 +39,26 @@ struct scalar_sum_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::padd(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux(a); } }; template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, // rough estimate! + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasAdd && packet_traits::HasAdd // TODO vectorize mixed sum }; }; -/** \internal - * \brief Template specialization to deprecate the summation of boolean expressions. - * This is required to solve Bug 426. 
- * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast() - */ -template<> struct scalar_sum_op : scalar_sum_op { - EIGEN_DEPRECATED - scalar_sum_op() {} -}; + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op::operator() (const bool& a, const bool& b) const { return a || b; } /** \internal @@ -83,23 +77,27 @@ struct scalar_product_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux_mul(a); } }; template struct functor_traits > { enum { - Cost = (NumTraits::MulCost + NumTraits::MulCost)/2, // rough estimate! + Cost = (int(NumTraits::MulCost) + int(NumTraits::MulCost))/2, // rough estimate! 
PacketAccess = is_same::value && packet_traits::HasMul && packet_traits::HasMul // TODO vectorize mixed product }; }; +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op::operator() (const bool& a, const bool& b) const { return a && b; } + + /** \internal * \brief Template functor to compute the conjugate product of two scalars * @@ -116,11 +114,11 @@ struct scalar_conj_product_op : binary_op_base typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return conj_helper().pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return conj_helper().pmul(a,b); } }; template @@ -136,21 +134,28 @@ struct functor_traits > { * * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff() */ -template +template struct scalar_min_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { + return internal::pmin(a, b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmin(a,b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { + return internal::pmin(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type 
predux(const Packet& a) const - { return internal::predux_min(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + { + return internal::predux_min(a); + } }; -template -struct functor_traits > { + +template +struct functor_traits > { enum { Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, PacketAccess = internal::is_same::value && packet_traits::HasMin @@ -162,21 +167,28 @@ struct functor_traits > { * * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff() */ -template -struct scalar_max_op : binary_op_base +template +struct scalar_max_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { + return internal::pmax(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmax(a,b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { + return internal::pmax(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const - { return internal::predux_max(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + { + return internal::predux_max(a); + } }; -template -struct functor_traits > { + +template +struct functor_traits > { enum { Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, PacketAccess = internal::is_same::value && packet_traits::HasMax @@ -253,7 +265,6 @@ struct scalar_cmp_op : binary_op_base > { /** \internal * \brief Template functor to compute the pow of two scalars + * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow */ 
template struct scalar_pow_op : binary_op_base @@ -301,16 +313,31 @@ struct scalar_pow_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif + EIGEN_DEVICE_FUNC inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { + return generic_pow(a,b); + } }; + template struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = (!NumTraits::IsComplex && !NumTraits::IsInteger && + packet_traits::HasExp && packet_traits::HasLog && + packet_traits::HasRound && packet_traits::HasCmp && + // Temporarly disable packet access for half/bfloat16 until + // accuracy is improved. + !is_same::value && !is_same::value + ) + }; }; - - //---------- non associative binary functors ---------- /** \internal @@ -337,7 +364,7 @@ struct scalar_difference_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, PacketAccess = is_same::value && packet_traits::HasSub && packet_traits::HasSub }; }; @@ -382,11 +409,14 @@ struct functor_traits > { struct scalar_boolean_and_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pand(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true }; }; @@ -398,11 +428,14 @@ template<> struct functor_traits { struct scalar_boolean_or_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { 
return a || b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::por(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true }; }; @@ -414,11 +447,44 @@ template<> struct functor_traits { struct scalar_boolean_xor_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pxor(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true + }; +}; + +/** \internal + * \brief Template functor to compute the absolute difference of two scalars + * + * \sa class CwiseBinaryOp, MatrixBase::absolute_difference + */ +template +struct scalar_absolute_difference_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op) +#else + scalar_absolute_difference_op() { + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const + { return numext::absdiff(a,b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pabsdiff(a,b); } +}; +template +struct functor_traits > { + enum { + Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + PacketAccess = is_same::value && packet_traits::HasAbsDiff }; }; @@ -436,7 +502,7 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind1st_op(const first_argument_type &val) : m_value(val) {} 
+ EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -455,7 +521,7 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind2nd_op(const second_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/inst/include/Eigen/src/Core/functors/NullaryFunctors.h b/inst/include/Eigen/src/Core/functors/NullaryFunctors.h index b03be026..192f225d 100644 --- a/inst/include/Eigen/src/Core/functors/NullaryFunctors.h +++ b/inst/include/Eigen/src/Core/functors/NullaryFunctors.h @@ -37,26 +37,27 @@ template struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; -template struct linspaced_op_impl; +template struct linspaced_op_impl; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : - m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : Scalar((high-low)/RealScalar(num_steps-1))), m_flip(numext::abs(high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { - typedef typename NumTraits::Real RealScalar; if(m_flip) - return (i==0)? 
m_low : (m_high - RealScalar(m_size1-i)*m_step); + return (i==0)? m_low : Scalar(m_high - RealScalar(m_size1-i)*m_step); else - return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step); + return (i==m_size1)? m_high : Scalar(m_low + RealScalar(i)*m_step); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { // Principle: @@ -65,17 +66,17 @@ struct linspaced_op_impl { Packet pi = plset(Scalar(i-m_size1)); Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); - if(i==0) - res = pinsertfirst(res, m_low); - return res; + if (EIGEN_PREDICT_TRUE(i != 0)) return res; + Packet mask = pcmp_lt(pset1(0), plset(0)); + return pselect(mask, res, pset1(m_low)); } else { Packet pi = plset(Scalar(i)); Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); - if(i==m_size1-unpacket_traits::size+1) - res = pinsertlast(res, m_high); - return res; + if(EIGEN_PREDICT_TRUE(i != m_size1-unpacket_traits::size+1)) return res; + Packet mask = pcmp_lt(plset(0), pset1(unpacket_traits::size-1)); + return pselect(mask, res, pset1(m_high)); } } @@ -86,10 +87,10 @@ struct linspaced_op_impl const bool m_flip; }; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), m_multiplier((high-low)/convert_index(num_steps<=1 ? 1 : num_steps-1)), m_divisor(convert_index((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))), @@ -115,8 +116,8 @@ struct linspaced_op_impl // Forward declaration (we default to random access which does not really give // us a speed gain when using packet access but it allows to use the functor in // nested expressions). 
-template struct linspaced_op; -template struct functor_traits< linspaced_op > +template struct linspaced_op; +template struct functor_traits< linspaced_op > { enum { @@ -126,9 +127,9 @@ template struct functor_traits< linspaced IsRepeatable = true }; }; -template struct linspaced_op +template struct linspaced_op { - linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) + EIGEN_DEVICE_FUNC linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low),high,num_steps) {} @@ -136,11 +137,11 @@ template struct linspaced_op EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp(i); } // This proxy object handles the actual required temporaries and the different // implementations (integer vs. floating point). - const linspaced_op_impl::IsInteger> impl; + const linspaced_op_impl::IsInteger> impl; }; // Linear access is automatically determined from the operator() prototypes available for the given functor. 
@@ -166,12 +167,12 @@ struct has_unary_operator,IndexType> { enum { value = template struct has_binary_operator,IndexType> { enum { value = 1}; }; -template -struct has_nullary_operator,IndexType> { enum { value = 0}; }; -template -struct has_unary_operator,IndexType> { enum { value = 1}; }; -template -struct has_binary_operator,IndexType> { enum { value = 0}; }; +template +struct has_nullary_operator,IndexType> { enum { value = 0}; }; +template +struct has_unary_operator,IndexType> { enum { value = 1}; }; +template +struct has_binary_operator,IndexType> { enum { value = 0}; }; template struct has_nullary_operator,IndexType> { enum { value = 1}; }; diff --git a/inst/include/Eigen/src/Core/functors/StlFunctors.h b/inst/include/Eigen/src/Core/functors/StlFunctors.h index 9c1d7585..4570c9b6 100644 --- a/inst/include/Eigen/src/Core/functors/StlFunctors.h +++ b/inst/include/Eigen/src/Core/functors/StlFunctors.h @@ -12,6 +12,28 @@ namespace Eigen { +// Portable replacements for certain functors. 
+namespace numext { + +template +struct equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs == rhs; + } +}; + +template +struct not_equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs != rhs; + } +}; + +} + + namespace internal { // default functor traits for STL functors: @@ -68,11 +90,19 @@ template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; -#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900) +template +struct functor_traits > + : functor_traits > {}; + +#if (EIGEN_COMP_CXXVER < 11) // std::binder* are deprecated since c++11 and will be removed in c++17 template struct functor_traits > @@ -83,7 +113,7 @@ struct functor_traits > { enum { Cost = functor_traits::Cost, PacketAccess = false }; }; #endif -#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910) +#if (EIGEN_COMP_CXXVER < 17) // std::unary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > diff --git a/inst/include/Eigen/src/Core/functors/UnaryFunctors.h b/inst/include/Eigen/src/Core/functors/UnaryFunctors.h index b56e7afd..16136d18 100644 --- a/inst/include/Eigen/src/Core/functors/UnaryFunctors.h +++ b/inst/include/Eigen/src/Core/functors/UnaryFunctors.h @@ -109,7 +109,7 @@ struct functor_traits > template struct scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op) EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); } + EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } }; @@ -117,7 +117,15 @@ 
template struct functor_traits > { enum { - Cost = NumTraits::IsComplex ? NumTraits::AddCost : 0, + Cost = 0, + // Yes the cost is zero even for complexes because in most cases for which + // the cost is used, conjugation turns to be a no-op. Some examples: + // cost(a*conj(b)) == cost(a*b) + // cost(a+conj(b)) == cost(a+b) + // ::HasConj }; }; @@ -130,7 +138,7 @@ struct functor_traits > template struct scalar_arg_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op) typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::parg(a); } @@ -158,6 +166,44 @@ template struct functor_traits > { enum { Cost = is_same::value ? 0 : NumTraits::AddCost, PacketAccess = false }; }; +/** \internal + * \brief Template functor to arithmetically shift a scalar right by a number of bits + * + * \sa class CwiseUnaryOp, MatrixBase::shift_right() + */ +template +struct scalar_shift_right_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const + { return a >> N; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::parithmetic_shift_right(a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = packet_traits::HasShift }; }; + +/** \internal + * \brief Template functor to logically shift a scalar left by a number of bits + * + * \sa class CwiseUnaryOp, MatrixBase::shift_left() + */ +template +struct scalar_shift_left_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const + { 
return a << N; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::plogical_shift_left(a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = packet_traits::HasShift }; }; + /** \internal * \brief Template functor to extract the real part of a complex * @@ -262,6 +308,26 @@ struct functor_traits > { }; }; +/** \internal + * + * \brief Template functor to compute the exponential of a scalar - 1. + * + * \sa class CwiseUnaryOp, ArrayBase::expm1() + */ +template struct scalar_expm1_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); } +}; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasExpm1, + Cost = functor_traits >::Cost // TODO measure cost of expm1 + }; +}; + /** \internal * * \brief Template functor to compute the logarithm of a scalar @@ -321,7 +387,7 @@ struct functor_traits > { */ template struct scalar_log10_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD_MATH(log10) return log10(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); } }; @@ -329,6 +395,22 @@ template struct functor_traits > { enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasLog10 }; }; +/** \internal + * + * \brief Template functor to compute the base-2 logarithm of a scalar + * + * \sa class CwiseUnaryOp, Cwise::log2() + */ +template struct scalar_log2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) 
const { return Scalar(EIGEN_LOG2E) * numext::log(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); } +}; +template +struct functor_traits > +{ enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasLog }; }; + /** \internal * \brief Template functor to compute the square root of a scalar * \sa class CwiseUnaryOp, Cwise::sqrt() @@ -356,13 +438,25 @@ struct functor_traits > { }; }; +// Boolean specialization to eliminate -Wimplicit-conversion-floating-point-to-bool warnings. +template<> struct scalar_sqrt_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return a; } +}; +template <> +struct functor_traits > { + enum { Cost = 1, PacketAccess = packet_traits::Vectorizable }; +}; + /** \internal * \brief Template functor to compute the reciprocal square root of a scalar * \sa class CwiseUnaryOp, Cwise::rsqrt() */ template struct scalar_rsqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::rsqrt(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } }; @@ -528,6 +622,23 @@ struct functor_traits > { }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the atanh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::atanh() + */ +template +struct scalar_atanh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + 
/** \internal * \brief Template functor to compute the sinh of a scalar * \sa class CwiseUnaryOp, ArrayBase::sinh() @@ -547,6 +658,23 @@ struct functor_traits > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the asinh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::asinh() + */ +template +struct scalar_asinh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the cosh of a scalar * \sa class CwiseUnaryOp, ArrayBase::cosh() @@ -566,6 +694,23 @@ struct functor_traits > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the acosh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::acosh() + */ +template +struct scalar_acosh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() @@ -578,9 +723,13 @@ struct scalar_inverse_op { EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pdiv(pset1(Scalar(1)),a); } }; -template -struct functor_traits > -{ enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasDiv }; }; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasDiv, + Cost = scalar_div_cost::value + }; +}; /** \internal * \brief Template functor to compute the square of a scalar @@ -598,6 +747,19 @@ template struct functor_traits > { enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasMul }; }; +// Boolean 
specialization to avoid -Wint-in-bool-context warnings on GCC. +template<> +struct scalar_square_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return a; } +}; +template<> +struct functor_traits > +{ enum { Cost = 0, PacketAccess = packet_traits::Vectorizable }; }; + /** \internal * \brief Template functor to compute the cube of a scalar * \sa class CwiseUnaryOp, Cwise::cube() @@ -614,6 +776,19 @@ template struct functor_traits > { enum { Cost = 2*NumTraits::MulCost, PacketAccess = packet_traits::HasMul }; }; +// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. +template<> +struct scalar_cube_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return a; } +}; +template<> +struct functor_traits > +{ enum { Cost = 0, PacketAccess = packet_traits::Vectorizable }; }; + /** \internal * \brief Template functor to compute the rounded value of a scalar * \sa class CwiseUnaryOp, ArrayBase::round() @@ -652,6 +827,25 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the rounded (with current rounding mode) value of a scalar + * \sa class CwiseUnaryOp, ArrayBase::rint() + */ +template struct scalar_rint_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = packet_traits::HasRint + }; +}; + /** \internal * \brief Template 
functor to compute the ceil of a scalar * \sa class CwiseUnaryOp, ArrayBase::ceil() @@ -678,7 +872,13 @@ struct functor_traits > template struct scalar_isnan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isnan(a); +#else + return (numext::isnan)(a); +#endif + } }; template struct functor_traits > @@ -696,7 +896,13 @@ struct functor_traits > template struct scalar_isinf_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isinf(a); +#else + return (numext::isinf)(a); +#endif + } }; template struct functor_traits > @@ -714,7 +920,13 @@ struct functor_traits > template struct scalar_isfinite_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isfinite(a); +#else + return (numext::isfinite)(a); +#endif + } }; template struct functor_traits > @@ -746,9 +958,9 @@ struct functor_traits > { * \brief Template functor to compute the signum of a scalar * \sa class CwiseUnaryOp, Cwise::sign() */ -template::IsComplex!=0) > struct scalar_sign_op; +template::IsComplex!=0), bool is_integer=(NumTraits::IsInteger!=0) > struct scalar_sign_op; template -struct scalar_sign_op { +struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) EIGEN_DEVICE_FUNC inline 
const Scalar operator() (const Scalar& a) const { @@ -758,8 +970,21 @@ struct scalar_sign_op { //template //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } }; + template -struct scalar_sign_op { +struct scalar_sign_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + { + return (numext::isnan)(a) ? a : Scalar( (a>Scalar(0)) - (a + //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } +}; + +template +struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { @@ -777,7 +1002,7 @@ struct scalar_sign_op { template struct functor_traits > { enum { - Cost = + Cost = NumTraits::IsComplex ? ( 8*NumTraits::MulCost ) // roughly : ( 3*NumTraits::AddCost), @@ -785,6 +1010,120 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the logistic function of a scalar + * \sa class CwiseUnaryOp, ArrayBase::logistic() + */ +template +struct scalar_logistic_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + return packetOp(x); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { + const Packet one = pset1(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + +#ifndef EIGEN_GPU_COMPILE_PHASE +/** \internal + * \brief Template specialization of the logistic function for float. + * + * Uses just a 9/10-degree rational interpolant which + * interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range + * [-9, 18]. Below -9 we use the more accurate approximation + * 1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing + * one ulp. The shifted logistic is interpolated because it was easier to + * make the fit converge. 
+ * + */ +template <> +struct scalar_logistic_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { + return packetOp(x); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& _x) const { + const Packet cutoff_lower = pset1(-9.f); + const Packet lt_mask = pcmp_lt(_x, cutoff_lower); + const bool any_small = predux_any(lt_mask); + + // The upper cut-off is the smallest x for which the rational approximation evaluates to 1. + // Choosing this value saves us a few instructions clamping the results at the end. +#ifdef EIGEN_VECTORIZE_FMA + const Packet cutoff_upper = pset1(15.7243833541870117f); +#else + const Packet cutoff_upper = pset1(15.6437711715698242f); +#endif + const Packet x = pmin(_x, cutoff_upper); + + // The monomial coefficients of the numerator polynomial (odd). + const Packet alpha_1 = pset1(2.48287947061529e-01f); + const Packet alpha_3 = pset1(8.51377133304701e-03f); + const Packet alpha_5 = pset1(6.08574864600143e-05f); + const Packet alpha_7 = pset1(1.15627324459942e-07f); + const Packet alpha_9 = pset1(4.37031012579801e-11f); + + // The monomial coefficients of the denominator polynomial (even). + const Packet beta_0 = pset1(9.93151921023180e-01f); + const Packet beta_2 = pset1(1.16817656904453e-01f); + const Packet beta_4 = pset1(1.70198817374094e-03f); + const Packet beta_6 = pset1(6.29106785017040e-06f); + const Packet beta_8 = pset1(5.76102136993427e-09f); + const Packet beta_10 = pset1(6.10247389755681e-13f); + + // Since the polynomials are odd/even, we need x^2. + const Packet x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. + Packet p = pmadd(x2, alpha_9, alpha_7); + p = pmadd(x2, p, alpha_5); + p = pmadd(x2, p, alpha_3); + p = pmadd(x2, p, alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. 
+ Packet q = pmadd(x2, beta_10, beta_8); + q = pmadd(x2, q, beta_6); + q = pmadd(x2, q, beta_4); + q = pmadd(x2, q, beta_2); + q = pmadd(x2, q, beta_0); + // Divide the numerator by the denominator and shift it up. + const Packet logistic = padd(pdiv(p, q), pset1(0.5f)); + if (EIGEN_PREDICT_FALSE(any_small)) { + const Packet exponential = pexp(_x); + return pselect(lt_mask, exponential, logistic); + } else { + return logistic; + } + } +}; +#endif // #ifndef EIGEN_GPU_COMPILE_PHASE + +template +struct functor_traits > { + enum { + // The cost estimate for float here here is for the common(?) case where + // all arguments are greater than -9. + Cost = scalar_div_cost::HasDiv>::value + + (internal::is_same::value + ? NumTraits::AddCost * 15 + NumTraits::MulCost * 11 + : NumTraits::AddCost * 2 + + functor_traits >::Cost), + PacketAccess = + packet_traits::HasAdd && packet_traits::HasDiv && + (internal::is_same::value + ? packet_traits::HasMul && packet_traits::HasMax && + packet_traits::HasMin + : packet_traits::HasNegate && packet_traits::HasExp) + }; +}; + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/inst/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 681451cc..f35b760c 100644 --- a/inst/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/inst/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,13 @@ namespace Eigen { namespace internal { -template +enum GEBPPacketSizeType { + GEBPPacketFull = 0, + GEBPPacketHalf, + GEBPPacketQuarter +}; + +template class gebp_traits; @@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff return a<=0 ? 
b : a; } +#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) + #if EIGEN_ARCH_i386_OR_x86_64 -const std::ptrdiff_t defaultL1CacheSize = 32*1024; -const std::ptrdiff_t defaultL2CacheSize = 256*1024; -const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024; +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024); +#elif EIGEN_ARCH_PPC +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024); #else -const std::ptrdiff_t defaultL1CacheSize = 16*1024; -const std::ptrdiff_t defaultL2CacheSize = 512*1024; -const std::ptrdiff_t defaultL3CacheSize = 512*1024; +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024); #endif +#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE + /** \internal */ struct CacheSizes { 
CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { @@ -50,7 +82,6 @@ struct CacheSizes { std::ptrdiff_t m_l3; }; - /** \internal */ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { @@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); + #ifdef EIGEN_VECTORIZE_AVX512 + // We need to find a rationale for that, but without this adjustment, + // performance with AVX512 is pretty bad, like -20% slower. + // One reason is that with increasing packet-size, the blocking size k + // has to become pretty small if we want that 1 lhs panel fit within L1. + // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: + // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. + // This is quite small for a good reuse of the accumulation registers. + l1 *= 4; + #endif if (num_threads > 1) { typedef typename Traits::ResScalar ResScalar; @@ -308,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ computeProductBlockingSizes(k, m, n, num_threads); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD - #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else - - // FIXME (a bit overkill maybe ?) 
- - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) - { - c = cj.pmadd(a,b,c); - } - }; - - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) - { - t = b; t = cj.pmul(a,t); c = padd(c,t); - } - }; +template +struct RhsPanelHelper { + private: + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + public: + typedef typename conditional=4, RhsPacketx4, RhsPacket>::type type; +}; - template - EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) - { - gebp_madd_selector::run(cj,a,b,c,t); - } +template +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; - #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); -#endif +template +struct packet_conditional { typedef T3 type; }; + +template +struct packet_conditional { typedef T1 type; }; + +template +struct packet_conditional { typedef T2 type; }; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + 
typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + ScalarPacket /* Vectorization logic * real*real: unpack rhs to constant packets, ... @@ -348,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: @@ -356,13 +422,17 @@ class gebp_traits typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -371,10 +441,12 @@ class gebp_traits // register block size along the M direction (currently, this one cannot be modified) default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) - // we assume 16 registers +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ + && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) + // we assume 16 registers or more // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + // Bug 1515: MSVC prior to v19.14 yields to register spilling. mr = Vectorizable ? 3*LhsPacketSize : default_mr, #else mr = default_mr, @@ -384,37 +456,41 @@ class gebp_traits RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) 
const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); @@ -432,8 +508,8 @@ class gebp_traits dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -448,6 +524,12 @@ class gebp_traits #endif } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -461,21 +543,25 @@ class gebp_traits }; -template -class gebp_traits, RealScalar, _ConjLhs, false> +template +class gebp_traits, RealScalar, _ConjLhs, false, Arch, _PacketSize> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? 
packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -490,13 +576,12 @@ class gebp_traits, RealScalar, _ConjLhs, false> RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; @@ -505,42 +590,64 @@ class gebp_traits, RealScalar, _ConjLhs, false> p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - dest = pset1(*b); + loadRhsQuad_impl(b,dest, typename conditional::type()); } - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const { - dest = pload(a); + // FIXME we can do better! 
+ // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad(tmp); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const { - dest = ploadu(a); + eigen_internal_assert(RhsPacketSize<=8); + dest = pset1(*b); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + dest = pload(a); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploadu(a); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -555,13 +662,20 @@ class gebp_traits, RealScalar, _ConjLhs, false> c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + 
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const + { + conj_helper cj; r = cj.pmadd(c,alpha,r); } protected: - conj_helper cj; }; template @@ -580,13 +694,57 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// it terms of real coefficients. + template -const DoublePacket& predux_downto4(const DoublePacket &a) +const DoublePacket& +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size<=8>::type* = 0) { return a; } -template struct unpacket_traits > { typedef DoublePacket half; }; +template +DoublePacket::half> +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket::half> res; + typedef std::complex::type> Cplx; + typedef typename packet_traits::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size<=8>::type* = 0) +{ + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); +} + +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish too :( + typedef typename NumTraits::Real RealScalar; + RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])}; + RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])}; + dest.first = ploadquad(r); + dest.second = ploadquad(i); +} + + +template struct unpacket_traits > { + typedef DoublePacket::half> half; +}; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) // { @@ -596,8 +754,8 @@ 
template struct unpacket_traits > { typede // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs > +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -605,15 +763,21 @@ class gebp_traits, std::complex, _ConjLhs, typedef std::complex RhsScalar; typedef std::complex ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - RealPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits::vectorizable + && unpacket_traits::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -623,14 +787,16 @@ class gebp_traits, std::complex, _ConjLhs, RhsProgress = 1 }; - typedef typename packet_traits::type RealPacket; - typedef typename packet_traits::type ScalarPacket; - typedef DoublePacket DoublePacketType; + typedef DoublePacket DoublePacketType; + typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; + + // this actualy holds 8 packets! 
+ typedef QuadPacket RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -641,51 +807,49 @@ class gebp_traits, std::complex, _ConjLhs, } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pset1(numext::real(*b)); - dest.second = pset1(numext::imag(*b)); + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); } - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - loadRhs(b,dest); + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); } - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + // Vectorized path + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const { - // FIXME not sure that's the best way to implement it! 
- loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadRhs(b,dest); } - - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadQuadToDoublePacket(b,dest); } // nothing special here @@ -694,47 +858,59 @@ class gebp_traits, std::complex, _ConjLhs, dest = pload((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu((const typename unpacket_traits::type*)(a)); + dest = ploadu((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE + typename enable_if::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } + + template + 
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacketType& alpha, ResPacketType& r) const { // assemble c - ResPacket tmp; + ResPacketType tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = padd(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = psub(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -744,8 +920,8 @@ class gebp_traits, std::complex, _ConjLhs, conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs > +template +class gebp_traits, false, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -753,14 +929,25 @@ class gebp_traits, false, _ConjRhs > typedef Scalar RhsScalar; typedef Scalar ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + 
PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -771,14 +958,11 @@ class gebp_traits, false, _ConjRhs > RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -786,22 +970,25 @@ class gebp_traits, false, _ConjRhs > p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, 
dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! -// b0 = pload1(b+0); -// b1 = pload1(b+1); -// } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -810,21 +997,23 @@ class gebp_traits, false, _ConjRhs > EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + dest = ploadquad(b); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploaddup(a); + dest = ploaddup(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -840,16 +1029,24 @@ class gebp_traits, false, _ConjRhs > c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, 
b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper cj; r = cj.pmadd(alpha,c,r); } protected: - conj_helper cj; + }; -/* optimized GEneral packed Block * packed Panel product kernel +/* optimized General packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments @@ -859,26 +1056,47 @@ class gebp_traits, false, _ConjRhs > template struct gebp_kernel { - typedef gebp_traits Traits; + typedef gebp_traits Traits; + typedef gebp_traits HalfTraits; + typedef gebp_traits QuarterTraits; + typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper::type RhsPanel15; + + typedef gebp_traits SwappedTraits; - typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter 
= QuarterTraits::LhsProgress, RhsProgress = Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -888,6 +1106,299 @@ struct gebp_kernel Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; +template::LhsProgress> +struct last_row_process_16_packets +{ + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + EIGEN_UNUSED_VARIABLE(res); + EIGEN_UNUSED_VARIABLE(straits); + EIGEN_UNUSED_VARIABLE(blA); + EIGEN_UNUSED_VARIABLE(blB); + EIGEN_UNUSED_VARIABLE(depth); + EIGEN_UNUSED_VARIABLE(endk); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j2); + EIGEN_UNUSED_VARIABLE(alpha); + EIGEN_UNUSED_VARIABLE(C0); + } +}; + + +template +struct last_row_process_16_packets { + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + typedef typename unpacket_traits::half>::half SResPacketQuarter; + typedef typename unpacket_traits::half>::half 
SLhsPacketQuarter; + typedef typename unpacket_traits::half>::half SRhsPacketQuarter; + typedef typename unpacket_traits::half>::half SAccPacketQuarter; + + SResPacketQuarter R = res.template gatherPacket(i, j2); + SResPacketQuarter alphav = pset1(alpha); + + if (depth - endk > 0) + { + // We have to handle the last row(s) of the rhs, which + // correspond to a half-packet + SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0)); + + for (Index kk = endk; kk < depth; kk++) + { + SLhsPacketQuarter a0; + SRhsPacketQuarter b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + straits.madd(a0,b0,c0,b0, fix<0>); + blB += SwappedTraits::LhsProgress/4; + blA += 1; + } + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R); + } + res.scatterPacket(i, j2, R); + } +}; + +template +struct lhs_process_one_packet +{ + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + __asm__ ("" : "+x,m" (*A0)); + #endif + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index strideA, Index strideB, Index 
offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i(alpha); + + R0 = r0.template loadPacket(0); + R1 = r1.template loadPacket(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket(0); + R1 = r3.template loadPacket(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ + } while(false); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k(alpha); + R0 = r0.template loadPacket(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template +struct lhs_process_fraction_of_packet : lhs_process_one_packet +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + 
traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + template EIGEN_DONT_INLINE void gebp_kernel @@ -904,10 +1415,12 @@ void gebp_kernel=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index prefetch_res_offset = 32/sizeof(ResScalar); + const int prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -965,36 +1478,48 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, 
fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1014,7 +1539,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1036,9 +1562,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); @@ -1046,9 +1572,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r2.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); @@ -1056,9 +1582,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); @@ -1094,20 +1620,20 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1117,8 +1643,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 
* Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1196,7 +1722,8 @@ void gebp_kernel=6 without FMA (bug 1637) @@ -1205,24 +1732,24 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1242,7 +1769,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r1.loadPacket(0 * Traits::ResPacketSize); - R3 = r1.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(0 * Traits::ResPacketSize); + R3 = r1.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); @@ -1265,10 +1793,10 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(0 * Traits::ResPacketSize); + R3 = r3.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); @@ -1313,8 +1841,8 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1327,8 +1855,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * 
Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); @@ -1358,186 +1886,43 @@ void gebp_kernel=1*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) - for(Index i=peeled_mc2; i(alpha); - - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r1.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0 * Traits::ResPacketSize, R0); - r1.storePacket(0 * Traits::ResPacketSize, R1); - - R0 = r2.loadPacket(0 * Traits::ResPacketSize); - R1 = r3.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0 * Traits::ResPacketSize, R0); - r3.storePacket(0 * Traits::ResPacketSize, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - r0.storePacket(0 * Traits::ResPacketSize, R0); - } - } + lhs_process_one_packet p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, 
offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc1::half>::size; + const int SResPacketQuarterSize = unpacket_traits::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && - (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr)) + (SwappedTraits::LhsProgress<=16) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); @@ -1560,15 +1945,15 @@ void gebp_kernel); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1581,7 +1966,7 @@ void gebp_kernel); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1591,7 +1976,7 @@ void gebp_kernel=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket(i, j2); @@ -1604,16 +1989,25 @@ void gebp_kernel); straits.acc(c0, alphav, R); } else { - straits.acc(predux_downto4(C0), alphav, R); + straits.acc(predux_half_dowto4(C0), alphav, R); } res.scatterPacket(i, j2, R); } + else if (SwappedTraits::LhsProgress==16) + { + // Special case where we 
have to first reduce the + // accumulation register C0. We specialize the block in + // template form, so that LhsProgress < 16 paths don't + // fail to compile + last_row_process_16_packets p; + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + } else { SResPacket R = res.template gatherPacket(i, j2); @@ -1636,14 +2030,14 @@ void gebp_kernel -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1717,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 - : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? 
peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; Index i=0; @@ -1733,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); + C = lhs.template loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1753,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1771,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); } } - // Pack scalars + // Pack half packets + if(HasHalf && Pack1>=HalfPacketSize) + { + for(; i(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); + } + } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. 
This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). if(Pack21) { - for(; i -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1821,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; -// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; -// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; -// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - - int pack = Pack1; Index i = 0; + int pack = Pack1; + int psize = PacketSize; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/pack)*pack; + Index peeled_mc = gone_last ? Pack2>1 ? 
(rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; for(; i=PacketSize) + if(pack>=psize && psize >= QuarterPacketSize) { - for(; k kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + if (psize == PacketSize) { + PacketBlock kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } } - count += PacketSize*pack; + count += psize*pack; } } + for(; k= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). 
+ if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } } for(; i kernel; @@ -1979,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; - kernel.packet[0] = dm0.loadPacket(k); - kernel.packet[1%PacketSize] = dm1.loadPacket(k); - kernel.packet[2%PacketSize] = dm2.loadPacket(k); - kernel.packet[3%PacketSize] = dm3.loadPacket(k); + kernel.packet[0 ] = dm0.template loadPacket(k); + kernel.packet[1%PacketSize] = dm1.template loadPacket(k); + kernel.packet[2%PacketSize] = dm2.template loadPacket(k); + kernel.packet[3%PacketSize] = dm3.template loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); @@ -2023,94 +2501,104 @@ template { typedef typename packet_traits::type Packet; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; typedef typename DataMapper::LinearMapper LinearMapper; - enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); -}; - -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) -{ - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); - EIGEN_UNUSED_VARIABLE(stride); - EIGEN_UNUSED_VARIABLE(offset); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - conj_if::IsComplex && Conjugate> cj; - Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; - Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; - Index count = 0; - -// if(nr>=8) -// { -// for(Index j2=0; j2(&rhs[k*rhsStride + j2]); -// pstoreu(blockB+count, cj.pconj(A)); -// } else if (PacketSize==4) { -// Packet A = ploadu(&rhs[k*rhsStride + j2]); -// Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); -// pstoreu(blockB+count, cj.pconj(A)); -// pstoreu(blockB+count+PacketSize, cj.pconj(B)); -// } else { -// const Scalar* b0 = &rhs[k*rhsStride + j2]; -// blockB[count+0] = cj(b0[0]); -// blockB[count+1] = cj(b0[1]); -// blockB[count+2] = cj(b0[2]); -// blockB[count+3] = cj(b0[3]); -// blockB[count+4] = cj(b0[4]); -// blockB[count+5] = cj(b0[5]); -// blockB[count+6] = cj(b0[6]); -// blockB[count+7] = cj(b0[7]); -// } -// count += 8; -// } -// // skip what we have after -// if(PanelMode) count += 8 * (stride-offset-depth); -// } -// } - if(nr>=4) + enum { PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size}; + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) { - for(Index j2=packet_cols8; j2=depth && offset<=stride)); + const bool HasHalf = (int)HalfPacketSize < (int)PacketSize; + const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize; + conj_if::IsComplex && Conjugate> cj; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; + Index count = 0; + + // if(nr>=8) + // { + // for(Index j2=0; j2(&rhs[k*rhsStride + j2]); + // pstoreu(blockB+count, cj.pconj(A)); + // } else if (PacketSize==4) { + // Packet A = ploadu(&rhs[k*rhsStride + j2]); + // Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); + // pstoreu(blockB+count, cj.pconj(A)); + // pstoreu(blockB+count+PacketSize, cj.pconj(B)); + // } else { + // const Scalar* b0 = &rhs[k*rhsStride + j2]; + // blockB[count+0] = cj(b0[0]); + // blockB[count+1] = cj(b0[1]); + // blockB[count+2] = cj(b0[2]); + // blockB[count+3] = cj(b0[3]); + // blockB[count+4] = cj(b0[4]); + // blockB[count+5] = cj(b0[5]); + // blockB[count+6] = cj(b0[6]); + // blockB[count+7] = cj(b0[7]); + // } + // count += 8; + // } + // // skip what we have after + // if(PanelMode) count += 8 * (stride-offset-depth); + // } + // } + if(nr>=4) { - // skip what we have before - if(PanelMode) count += 4 * offset; - for(Index k=0; k(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else if (HasHalf && HalfPacketSize==4) { + HalfPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += HalfPacketSize; + } else if (HasQuarter && QuarterPacketSize==4) { + QuarterPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += QuarterPacketSize; + } else { + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); + count += 4; + } } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += 4 * (stride-offset-depth); } - } - // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols4; j2 pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; @@ -110,7 +110,7 @@ static void run(Index rows, Index cols, Index depth, // i.e., we test that 
info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} - info[tid].users += threads; + info[tid].users = threads; pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); @@ -148,7 +148,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 for(Index i=0; i template static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program + // to determine the following heuristic. + // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, + // unless it has been specialized by the user or for a given architecture. + // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. + // I'm not sure it is still required. 
+ if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { @@ -441,7 +449,7 @@ struct generic_product_impl template static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); @@ -450,7 +458,7 @@ struct generic_product_impl template static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); @@ -463,11 +471,25 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; + if (dst.cols() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::ColXpr dst_vec(dst.col(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); + } + else if (dst.rows() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::RowXpr dst_vec(dst.row(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); + } + typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs); typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor,LhsScalar,RhsScalar, Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index d68d2f96..6ba0d9bd 100644 --- a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -87,7 +87,7 @@ struct general_matrix_matrix_triangular_product pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; tribb_kernel sybb; @@ -302,13 +302,13 @@ struct general_product_to_triangular_selector template template -TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - + general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - + return derived(); } diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 691f95d6..9a650ec2 100644 --- a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -37,7 +37,7 @@ namespace Eigen { namespace internal { -template +template struct general_matrix_matrix_rankupdate : general_matrix_matrix_triangular_product< Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {}; diff --git 
a/inst/include/Eigen/src/Core/products/GeneralMatrixVector.h b/inst/include/Eigen/src/Core/products/GeneralMatrixVector.h index a597c1f4..dfb6aebc 100644 --- a/inst/include/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/inst/include/Eigen/src/Core/products/GeneralMatrixVector.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2009 Gael Guennebaud +// Copyright (C) 2008-2016 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -14,11 +14,57 @@ namespace Eigen { namespace internal { +enum GEMVPacketSizeType { + GEMVPacketFull = 0, + GEMVPacketHalf, + GEMVPacketQuarter +}; + +template +struct gemv_packet_cond { typedef T3 type; }; + +template +struct gemv_packet_cond { typedef T1 type; }; + +template +struct gemv_packet_cond { typedef T2 type; }; + +template +class gemv_traits +{ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename gemv_packet_cond::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## name ## Packet + + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); +#undef PACKET_DECL_COND_PREFIX + +public: + enum { + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && + unpacket_traits<_RhsPacket>::vectorizable && + int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size), + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<_ResPacket>::size : 1 + }; + + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; +}; + + /* Optimized col-major matrix * vector product: - * This algorithm processes 4 columns at onces that allows to both reduce - * the number of load/stores of the result by a factor 4 and to reduce - * the instruction dependency. Moreover, we know that all bands have the - * same alignment pattern. + * This algorithm processes the matrix per vertical panels, + * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments. * * Mixing type logic: C += alpha * A * B * | A | B |alpha| comments @@ -27,56 +73,30 @@ namespace internal { * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp * |cplx |real |real | optimal case, vectorization possible via real-cplx mul * - * Accesses to the matrix coefficients follow the following logic: - * - * - if all columns have the same alignment then - * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case) - * - otherwise perform unaligned loads only (-> NoneAligned case) - * - otherwise - * - if even columns have the same alignment then - * // odd columns are guaranteed to have the same alignment too - * - if even or odd columns have the same alignment as the result, then - * // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double) - * - perform half aligned and half unaligned loads (-> EvenAligned case) - * - otherwise perform unaligned loads only (-> NoneAligned case) - * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then - * - one over 4 consecutive columns is guaranteed to be aligned with the result vector, - * perform simple aligned loads for this column and aligned loads plus re-alignment for the other. 
(-> FirstAligned case) - * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h - * - otherwise, - * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats), - * // we currently fall back to the NoneAligned case - * * The same reasoning apply for the transposed case. - * - * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... - * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment - * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow - * compared to unaligned loads on a 4 byte boundary. - * */ template struct general_matrix_vector_product { + typedef gemv_traits Traits; + typedef gemv_traits HalfTraits; + typedef gemv_traits QuarterTraits; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; -enum { - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable - && int(packet_traits::size)==int(packet_traits::size), - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? 
packet_traits::size : 1 -}; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; -typedef typename packet_traits::type _LhsPacket; -typedef typename packet_traits::type _RhsPacket; -typedef typename packet_traits::type _ResPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; -typedef typename conditional::type LhsPacket; -typedef typename conditional::type RhsPacket; -typedef typename conditional::type ResPacket; + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -85,244 +105,187 @@ EIGEN_DONT_INLINE static void run( }; template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& lhs, + const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { EIGEN_UNUSED_VARIABLE(resIncr); eigen_internal_assert(resIncr==1); - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ - pstore(&res[j], \ - padd(pload(&res[j]), \ - padd( \ - padd(pcj.pmul(lhs0.template load(j), ptmp0), \ - pcj.pmul(lhs1.template load(j), ptmp1)), \ - padd(pcj.pmul(lhs2.template load(j), ptmp2), \ - pcj.pmul(lhs3.template load(j), ptmp3)) ))) - - typedef typename LhsMapper::VectorMapper LhsScalars; + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC 
to generate propoer code. + LhsMapper lhs(alhs); conj_helper cj; conj_helper pcj; - if(ConjugateRhs) - alpha = numext::conj(alpha); - - enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; - const Index columnsAtOnce = 4; - const Index peels = 2; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index ResPacketAlignedMask = ResPacketSize-1; -// const Index PeelAlignedMask = ResPacketSize*peels-1; - const Index size = rows; + conj_helper pcj_half; + conj_helper pcj_quarter; const Index lhsStride = lhs.stride(); - - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type. - Index alignedStart = internal::first_default_aligned(res,size); - Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; - - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(size); - - // find how many columns do we have to skip to be aligned with the result (if possible) - Index skipColumns = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) ) - { - alignedSize = 0; - alignedStart = 0; - alignmentPattern = NoneAligned; - } - else if(LhsPacketSize > 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. 
- // Currently, it seems to be better to perform unaligned loads anyway - alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + ResPacketSizeHalf = HalfTraits::ResPacketSize, + ResPacketSizeQuarter = QuarterTraits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, + HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf + }; + + const Index n8 = rows-8*ResPacketSize+1; + const Index n4 = rows-4*ResPacketSize+1; + const Index n3 = rows-3*ResPacketSize+1; + const Index n2 = rows-2*ResPacketSize+1; + const Index n1 = rows-1*ResPacketSize+1; + const Index n_half = rows-1*ResPacketSizeHalf+1; + const Index n_quarter = rows-1*ResPacketSizeQuarter+1; + + // TODO: improve the following heuristic: + const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4); + ResPacket palpha = pset1(alpha); + ResPacketHalf palpha_half = pset1(alpha); + ResPacketQuarter palpha_quarter = pset1(alpha); + + for(Index j2=0; j2(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)), + c4 = pset1(ResScalar(0)), + c5 = pset1(ResScalar(0)), + c6 = pset1(ResScalar(0)), + c7 = pset1(ResScalar(0)); + + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+LhsPacketSize*3,j),b0,c3); + c4 = pcj.pmadd(lhs.template load(i+LhsPacketSize*4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load(i+LhsPacketSize*5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load(i+LhsPacketSize*6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load(i+LhsPacketSize*7,j),b0,c7); + } + pstoreu(res+i+ResPacketSize*0, 
pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); + pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu(res+i+ResPacketSize*3))); + pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu(res+i+ResPacketSize*4))); + pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu(res+i+ResPacketSize*5))); + pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu(res+i+ResPacketSize*6))); + pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu(res+i+ResPacketSize*7))); } - else + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)); - /* eigen_internal_assert( (alignmentPattern==NoneAligned) - || (skipColumns + columnsAtOnce >= cols) - || LhsPacketSize > size - || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ - } - else if(Vectorizable) - { - alignedStart = 0; - alignedSize = size; - alignmentPattern = AllAligned; - } - - const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1; - const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3; + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+LhsPacketSize*3,j),b0,c3); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); + pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu(res+i+ResPacketSize*3))); - Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; - for (Index i=skipColumns; i(alpha*rhs(i, 0)), - ptmp1 = 
pset1(alpha*rhs(i+offset1, 0)), - ptmp2 = pset1(alpha*rhs(i+2, 0)), - ptmp3 = pset1(alpha*rhs(i+offset3, 0)); + i+=ResPacketSize*4; + } + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)); - // this helps a lot generating better binary code - const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), - lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); - if (Vectorizable) + i+=ResPacketSize*3; + } + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)); + + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); } - - if (alignedSize>alignedStart) + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + i+=ResPacketSize*2; + } + if(i(ResScalar(0)); + for(Index j=j2; j1) - { - LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; - ResPacket T0, T1; - - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - A00 = lhs0.template load(j); - A10 = lhs0.template load(j+LhsPacketSize); - T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); - T1 = pcj.pmadd(A10, ptmp0, 
pload(&res[j+ResPacketSize])); - - T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - T0 = pcj.pmadd(A03, ptmp3, T0); - pstore(&res[j],T0); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - T1 = pcj.pmadd(A11, ptmp1, T1); - T1 = pcj.pmadd(A12, ptmp2, T1); - T1 = pcj.pmadd(A13, ptmp3, T1); - pstore(&res[j+ResPacketSize],T1); - } - } - for (; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); } - } // end explicit vectorization - - /* process remaining coeffs (or all if there is no explicit vectorization) */ - for (Index j=alignedSize; j(res+i+ResPacketSize*0))); + i+=ResPacketSize; + } + if(HasHalf && i(ResScalar(0)); + for(Index j=j2; j(rhs(j,0)); + c0 = pcj_half.pmadd(lhs.template load(i+0,j),b0,c0); + } + pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu(res+i+ResPacketSizeHalf*0))); + i+=ResPacketSizeHalf; } - } - - // process remaining first and last columns (at most columnsAtOnce-1) - Index end = cols; - Index start = columnBound; - do - { - for (Index k=start; k(alpha*rhs(k, 0)); - const LhsScalars lhs0 = lhs.getVectorMapper(0, k); - - if (Vectorizable) + ResPacketQuarter c0 = pset1(ResScalar(0)); + for(Index j=j2; j(alignedStart)) - for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); - else - for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); + RhsPacketQuarter b0 = pset1(rhs(j,0)); + c0 = pcj_quarter.pmadd(lhs.template load(i+0,j),b0,c0); } - - // process remaining scalars (or all if no explicit vectorization) - for (Index i=alignedSize; i(res+i+ResPacketSizeQuarter*0))); + i+=ResPacketSizeQuarter; } - if (skipColumns) + for(;i struct general_matrix_vector_product { -typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - -enum { - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable - && 
int(packet_traits::size)==int(packet_traits::size), - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1 -}; + typedef gemv_traits Traits; + typedef gemv_traits HalfTraits; + typedef gemv_traits QuarterTraits; + + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; -typedef typename packet_traits::type _LhsPacket; -typedef typename packet_traits::type _RhsPacket; -typedef typename packet_traits::type _ResPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; -typedef typename conditional::type LhsPacket; -typedef typename conditional::type RhsPacket; -typedef typename conditional::type ResPacket; + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -361,255 +324,191 @@ EIGEN_DONT_INLINE static void run( }; template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& lhs, + const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - eigen_internal_assert(rhs.stride()==1); - - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ - RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ - ptmp0 = 
pcj.pmadd(lhs0.template load(j), b, ptmp0); \ - ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ - ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ - ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate propoer code. + LhsMapper lhs(alhs); + eigen_internal_assert(rhs.stride()==1); conj_helper cj; conj_helper pcj; - - typedef typename LhsMapper::VectorMapper LhsScalars; - - enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; - const Index rowsAtOnce = 4; - const Index peels = 2; - const Index RhsPacketAlignedMask = RhsPacketSize-1; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index depth = cols; - const Index lhsStride = lhs.stride(); - - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type - // if that's not the case then vectorization is discarded, see below. - Index alignedStart = rhs.firstAligned(depth); - Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; - - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(depth); - const Index rhsAlignmentOffset = rhs.firstAligned(rows); - - // find how many rows do we have to skip to be aligned with rhs (if possible) - Index skipRows = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || - (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || - (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) - { - alignedSize = 0; - alignedStart = 0; - alignmentPattern = NoneAligned; - } - else if(LhsPacketSize > 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. - alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) + conj_helper pcj_half; + conj_helper pcj_quarter; + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. + const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; + const Index n4 = rows-3; + const Index n2 = rows-1; + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + ResPacketSizeHalf = HalfTraits::ResPacketSize, + ResPacketSizeQuarter = QuarterTraits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + LhsPacketSizeHalf = HalfTraits::LhsPacketSize, + LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize, + HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, + HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf + }; + + Index i=0; + for(; i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)), + c4 = pset1(ResScalar(0)), + c5 = pset1(ResScalar(0)), + c6 = pset1(ResScalar(0)), + c7 = pset1(ResScalar(0)); + + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) { - // nothing can be aligned, no need to skip any column - alignmentPattern = NoneAligned; - skipRows = 0; + RhsPacket b0 = rhs.template load(j,0); + + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); + c4 = 
pcj.pmadd(lhs.template load(i+4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load(i+5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load(i+6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load(i+7,j),b0,c7); } - else + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + ResScalar cc4 = predux(c4); + ResScalar cc5 = predux(c5); + ResScalar cc6 = predux(c6); + ResScalar cc7 = predux(c7); + for(; j= rows) - || LhsPacketSize > depth - || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ + res[(i+0)*resIncr] += alpha*cc0; + res[(i+1)*resIncr] += alpha*cc1; + res[(i+2)*resIncr] += alpha*cc2; + res[(i+3)*resIncr] += alpha*cc3; + res[(i+4)*resIncr] += alpha*cc4; + res[(i+5)*resIncr] += alpha*cc5; + res[(i+6)*resIncr] += alpha*cc6; + res[(i+7)*resIncr] += alpha*cc7; } - else if(Vectorizable) + for(; i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)); - Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; - for (Index i=skipRows; i(j,0); - if (Vectorizable) + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + for(; j(ResScalar(0)), ptmp1 = pset1(ResScalar(0)), - ptmp2 = pset1(ResScalar(0)), ptmp3 = pset1(ResScalar(0)); + RhsScalar b0 = rhs(j,0); - // process initial unaligned coeffs - // FIXME this loop get vectorized by the compiler ! - for (Index j=0; j(ResScalar(0)), + c1 = pset1(ResScalar(0)); - if (alignedSize>alignedStart) - { - switch(alignmentPattern) - { - case AllAligned: - for (Index j = alignedStart; j1) - { - /* Here we proccess 4 rows with with two peeled iterations to hide - * the overhead of unaligned loads. 
Moreover unaligned loads are handled - * using special shift/move operations between the two aligned packets - * overlaping the desired unaligned packet. This is *much* more efficient - * than basic unaligned loads. - */ - LhsPacket A01, A02, A03, A11, A12, A13; - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(0); - A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); - ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - - b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); - ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); - ptmp1 = pcj.pmadd(A11, b, ptmp1); - ptmp2 = pcj.pmadd(A12, b, ptmp2); - ptmp3 = pcj.pmadd(A13, b, ptmp3); - } - } - for (; j(j,0); - // process remaining coeffs (or all if no explicit vectorization) - // FIXME this loop get vectorized by the compiler ! - for (Index j=alignedSize; j(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + for(; j(ResScalar(0)); + ResPacketHalf c0_h = pset1(ResScalar(0)); + ResPacketQuarter c0_q = pset1(ResScalar(0)); + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) { - EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0); - ResPacket ptmp0 = pset1(tmp0); - const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); - // process first unaligned result's coeffs - // FIXME this loop get vectorized by the compiler ! 
- for (Index j=0; jalignedStart) - { - // process aligned rhs coeffs - if (lhs0.template aligned(alignedStart)) - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - else - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - tmp0 += predux(ptmp0); - } - - // process remaining scalars - // FIXME this loop get vectorized by the compiler ! - for (Index j=alignedSize; j(j,0); + c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); } - if (skipRows) + ResScalar cc0 = predux(c0); + if (HasHalf) { + for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) + { + RhsPacketHalf b0 = rhs.template load(j,0); + c0_h = pcj_half.pmadd(lhs.template load(i,j),b0,c0_h); + } + cc0 += predux(c0_h); + } + if (HasQuarter) { + for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter) + { + RhsPacketQuarter b0 = rhs.template load(j,0); + c0_q = pcj_quarter.pmadd(lhs.template load(i,j),b0,c0_q); + } + cc0 += predux(c0_q); + } + for(; j +#endif + namespace Eigen { namespace internal { @@ -18,7 +22,7 @@ namespace internal { inline void manage_multi_threading(Action action, int* v) { static int m_maxThreads = -1; - EIGEN_UNUSED_VARIABLE(m_maxThreads); + EIGEN_UNUSED_VARIABLE(m_maxThreads) if(action==SetAction) { @@ -76,8 +80,17 @@ template struct GemmParallelInfo { GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} + // volatile is not enough on all architectures (see bug 1572) + // to guarantee that when thread A says to thread B that it is + // done with packing a block, then all writes have been really + // carried out... C++11 memory model+atomic guarantees this. 
+#if EIGEN_HAS_CXX11_ATOMIC + std::atomic sync; + std::atomic users; +#else Index volatile sync; int volatile users; +#endif Index lhs_start; Index lhs_length; @@ -88,11 +101,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, { // TODO when EIGEN_USE_BLAS is defined, // we should still enable OMP for other scalar types -#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS) + // Without C++11, we have to disable GEMM's parallelization on + // non x86 architectures because there volatile is not enough for our purpose. + // See bug 1572. +#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64)) // FIXME the transpose variable is only needed to properly split // the matrix product when multithreading is enabled. This is a temporary // fix to support row-major destination matrices. This whole - // parallelizer mechanism has to be redisigned anyway. + // parallelizer mechanism has to be redesigned anyway. EIGEN_UNUSED_VARIABLE(depth); EIGEN_UNUSED_VARIABLE(transpose); func(0,rows, 0,cols); @@ -113,12 +129,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, double work = static_cast(rows) * static_cast(cols) * static_cast(depth); double kMinTaskSize = 50000; // FIXME improve this heuristic. - pb_max_threads = std::max(1, std::min(pb_max_threads, work / kMinTaskSize)); + pb_max_threads = std::max(1, std::min(pb_max_threads, static_cast( work / kMinTaskSize ) )); // compute the number of threads we are going to use Index threads = std::min(nbThreads(), pb_max_threads); - // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session, + // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session, // then abort multi-threading // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? 
if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) @@ -151,10 +167,8 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, info[i].lhs_start = r0; info[i].lhs_length = actualBlockRows; - if(transpose) - func(c0, actualBlockCols, 0, rows, info); - else - func(0, rows, c0, actualBlockCols, info); + if(transpose) func(c0, actualBlockCols, 0, rows, info); + else func(0, rows, c0, actualBlockCols, info); } #endif } diff --git a/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 04c93348..33ecf10f 100644 --- a/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -45,14 +45,23 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::type>::half HalfPacket; + typedef typename unpacket_traits::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? 
peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i(blockA, lhs, cols, i, count); + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i(blockA, lhs, cols, i, count); + // do the same with mr==1 - for(Index i=peeled_mc1; i gebp_kernel; symm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -442,7 +459,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2 -EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::run( +EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void selfadjoint_matrix_vector_product::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, @@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product enum { LhsUpLo = LhsMode&(Upper|Lower) }; template - static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { typedef typename Dest::Scalar ResScalar; typedef typename Rhs::Scalar RhsScalar; diff --git a/inst/include/Eigen/src/Core/products/SelfadjointProduct.h b/inst/include/Eigen/src/Core/products/SelfadjointProduct.h index ef12c98f..a21be805 100644 --- a/inst/include/Eigen/src/Core/products/SelfadjointProduct.h +++ b/inst/include/Eigen/src/Core/products/SelfadjointProduct.h @@ -111,7 +111,7 @@ struct 
selfadjoint_product_selector Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo> ::run(size, depth, - &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), + actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(), mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; @@ -120,7 +120,7 @@ struct selfadjoint_product_selector template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const Scalar& alpha) { selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/inst/include/Eigen/src/Core/products/SelfadjointRank2Update.h b/inst/include/Eigen/src/Core/products/SelfadjointRank2Update.h index 2ae36411..f752a0bf 100644 --- a/inst/include/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/inst/include/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector; template struct selfadjoint_rank2_update_selector { - static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { const Index size = u.size(); for (Index i=0; i struct conj_expr_if template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const Scalar& alpha) { typedef internal::blas_traits UBlasTraits; @@ -79,8 +80,8 @@ ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const if (IsRowMajor) actualAlpha = numext::conj(actualAlpha); - typedef typename internal::remove_all::type>::type UType; - typedef typename 
internal::remove_all::type>::type VType; + typedef typename internal::remove_all::type>::type UType; + typedef typename internal::remove_all::type>::type VType; internal::selfadjoint_rank2_update_selector ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); diff --git a/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix.h b/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix.h index 2fb408d1..f0c60507 100644 --- a/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -155,7 +155,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; @@ -226,7 +226,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, @@ -305,7 +305,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; diff --git a/inst/include/Eigen/src/Core/products/TriangularSolverMatrix.h b/inst/include/Eigen/src/Core/products/TriangularSolverMatrix.h index e3ed2cd1..6d879ba0 100644 --- a/inst/include/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/inst/include/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache @@ -136,7 +136,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix gebp_kernel; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gemm_pack_lhs 
pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k20) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - if(!(Mode & UnitDiag)) + if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) rhs[i] /= cjLhs(i,i); } } @@ -114,20 +114,23 @@ struct triangular_solve_vector0) - Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + if(numext::not_equal_strict(rhs[i],RhsScalar(0))) + { + if(!(Mode & UnitDiag)) + rhs[i] /= cjLhs.coeff(i,i); + + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } } Index r = IsLower ? size - endBlock : startBlock; // remaining size if (r > 0) { // let's directly call the low level product function because: // 1 - it is faster to compile - // 2 - it is slighlty faster at runtime + // 2 - it is slightly faster at runtime general_matrix_vector_product::run( r, actualPanelWidth, LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), diff --git a/inst/include/Eigen/src/Core/util/BlasUtil.h b/inst/include/Eigen/src/Core/util/BlasUtil.h index 3dff9bc9..e16a5649 100755 --- a/inst/include/Eigen/src/Core/util/BlasUtil.h +++ b/inst/include/Eigen/src/Core/util/BlasUtil.h @@ -24,7 +24,7 @@ struct gebp_kernel; template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -39,90 +39,6 @@ template struct general_matrix_vector_product; - -template struct conj_if; - -template<> struct conj_if { - template - inline T operator()(const T& x) const { return numext::conj(x); } - template - inline T pconj(const T& x) const { return internal::pconj(x); } -}; - -template<> struct conj_if { - template - inline const T& operator()(const T& x) const { return x; } - template - inline const T& pconj(const T& x) const { return x; } -}; - -// Generic implementation for custom complex types. 
-template -struct conj_helper -{ - typedef typename ScalarBinaryOpTraits::ReturnType Scalar; - - EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const - { return conj_if()(x) * conj_if()(y); } -}; - -template struct conj_helper -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); } -}; - -template struct conj_helper, std::complex, false,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); } -}; - -template struct conj_helper, std::complex, true,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, std::complex, true,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } 
-}; - -template struct conj_helper, RealScalar, Conj,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return conj_if()(x)*y; } -}; - -template struct conj_helper, false,Conj> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x*conj_if()(y); } -}; - template struct get_factor { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); } }; @@ -159,11 +75,9 @@ template class BlasLinearMapper; template -class BlasLinearMapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - +class BlasLinearMapper +{ +public: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1) : m_data(data) { @@ -179,19 +93,17 @@ class BlasLinearMapper { return m_data[i]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return ploadt(m_data + i); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return ploadt(m_data + i); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { + return ploadt(m_data + i); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { - pstoret(m_data + i, p); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { + pstoret(m_data + i, p); } - protected: +protected: Scalar *m_data; }; @@ -199,13 +111,59 @@ class BlasLinearMapper { template class blas_data_mapper; +// TMP to help PacketBlock store implementation. 
+// There's currently no known use case for PacketBlock load. +// The default implementation assumes ColMajor order. +// It always store each packet sequentially one `stride` apart. +template +struct PacketBlockManagement +{ + PacketBlockManagement pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + pbm.store(to, stride, i, j, block); + pstoreu(to + i + (j + idx)*stride, block.packet[idx]); + } +}; + +// PacketBlockManagement specialization to take care of RowMajor order without ifs. +template +struct PacketBlockManagement +{ + PacketBlockManagement pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + pbm.store(to, stride, i, j, block); + pstoreu(to + j + (i + idx)*stride, block.packet[idx]); + } +}; + +template +struct PacketBlockManagement +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + +template +struct PacketBlockManagement +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + template class blas_data_mapper { public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; @@ -235,12 +193,14 @@ class blas_data_mapper return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { + return ploadt(&operator()(i, j)); } template @@ -263,7 +223,12 @@ class blas_data_mapper return internal::first_default_aligned(m_data, size); } - protected: + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock &block) const { + PacketBlockManagement pbm; + pbm.store(m_data, m_stride, i, j, block); + } +protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; }; @@ -275,9 +240,6 @@ template class BlasLinearMapper { public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { @@ -288,8 +250,9 @@ class BlasLinearMapper return m_data[i*m_incr.value()]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return pgather(m_data + i*m_incr.value(), m_incr.value()); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { + return pgather(m_data + i*m_incr.value(), m_incr.value()); } template @@ -306,9 +269,6 @@ template::type Packet; - typedef typename packet_traits::half HalfPacket; - typedef BlasLinearMapper LinearMapper; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {} @@ -327,8 +287,9 @@ class blas_data_mapper return 
m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return pgather(&operator()(i, j),m_incr.value()); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); } template @@ -346,6 +307,77 @@ class blas_data_mapper return pgather(&operator()(i, j), m_stride); } + // storePacketBlock_helper defines a way to access values inside the PacketBlock, this is essentially required by the Complex types. + template + struct storePacketBlock_helper + { + storePacketBlock_helper spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup, i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + ScalarT *v = &sup->operator()(i+l, j+idx); + *v = block.packet[idx][l]; + } + } + }; + + template + struct storePacketBlock_helper, n, idx> + { + storePacketBlock_helper, n, idx-1> spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup,i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + std::complex *v = &sup->operator()(i+l, j+idx); + v->real(block.packet[idx].v[2*l+0]); + v->imag(block.packet[idx].v[2*l+1]); + } + } + }; + + template + struct storePacketBlock_helper, n, idx> + { + storePacketBlock_helper, n, idx-1> spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup,i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + std::complex *v = &sup->operator()(i+l, j+idx); + v->real(block.packet[idx].v[2*l+0]); + v->imag(block.packet[idx].v[2*l+1]); + } + } + }; + + template + struct storePacketBlock_helper + { + EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + + template + struct storePacketBlock_helper, n, -1> + { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + + template + struct storePacketBlock_helper, n, -1> + { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + // This function stores a PacketBlock on m_data, this approach is really quite slow compare to Incr=1 and should be avoided when possible. + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock&block) const { + storePacketBlock_helper spb; + spb.store(this, i,j,block); + } protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; @@ -379,14 +411,15 @@ template struct blas_traits HasUsableDirectAccess = ( (int(XprType::Flags)&DirectAccessBit) && ( bool(XprType::IsVectorAtCompileTime) || int(inner_stride_at_compile_time::ret) == 1) - ) ? 1 : 0 + ) ? 
1 : 0, + HasScalarFactor = false }; typedef typename conditional::type DirectLinearAccessType; - static inline ExtractType extract(const XprType& x) { return x; } - static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; } + static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -411,17 +444,23 @@ template struct blas_traits, const CwiseNullaryOp,Plain>, NestedXpr> > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline Scalar extractScalarFactor(const XprType& x) + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template struct blas_traits, NestedXpr, const CwiseNullaryOp,Plain> > > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseBinaryOp, NestedXpr, const CwiseNullaryOp,Plain> > XprType; typedef typename Base::ExtractType ExtractType; @@ -440,6 +479,9 @@ template struct blas_traits, NestedXpr> > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseUnaryOp, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; @@ -476,7 +518,7 @@ struct blas_traits template::HasUsableDirectAccess> struct extract_data_selector { - static const typename T::Scalar* run(const T& m) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) { return blas_traits::extract(m).data(); } @@ -487,11 +529,53 @@ struct 
extract_data_selector { static typename T::Scalar* run(const T&) { return 0; } }; -template const typename T::Scalar* extract_data(const T& m) +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) { return extract_data_selector::run(m); } +/** + * \c combine_scalar_factors extracts and multiplies factors from GEMM and GEMV products. + * There is a specialization for booleans + */ +template +struct combine_scalar_factors_impl +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs) + { + return blas_traits::extractScalarFactor(lhs) * blas_traits::extractScalarFactor(rhs); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs) + { + return alpha * blas_traits::extractScalarFactor(lhs) * blas_traits::extractScalarFactor(rhs); + } +}; +template +struct combine_scalar_factors_impl +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs) + { + return blas_traits::extractScalarFactor(lhs) && blas_traits::extractScalarFactor(rhs); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs) + { + return alpha && blas_traits::extractScalarFactor(lhs) && blas_traits::extractScalarFactor(rhs); + } +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs) +{ + return combine_scalar_factors_impl::run(alpha, lhs, rhs); +} +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs) +{ + return combine_scalar_factors_impl::run(lhs, rhs); +} + + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Core/util/ConfigureVectorization.h b/inst/include/Eigen/src/Core/util/ConfigureVectorization.h new file mode 100644 index 00000000..af4e6962 --- /dev/null +++ 
b/inst/include/Eigen/src/Core/util/ConfigureVectorization.h @@ -0,0 +1,512 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2018 Gael Guennebaud +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CONFIGURE_VECTORIZATION_H +#define EIGEN_CONFIGURE_VECTORIZATION_H + +//------------------------------------------------------------------------------------------ +// Static and dynamic alignment control +// +// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES +// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. +// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, +// a default value is automatically computed based on architecture, compiler, and OS. +// +// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} +// to be used to declare statically aligned buffers. +//------------------------------------------------------------------------------------------ + + +/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. + * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, + * so that vectorization doesn't affect binary compatibility. + * + * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link + * vectorized and non-vectorized code. + * + * FIXME: this code can be cleaned up once we switch to proper C++11 only. 
+ */ +#if (defined EIGEN_CUDACC) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_HAS_ALIGNAS + #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n) + #define EIGEN_ALIGNOF(x) alignof(x) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_MSVC + #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_SUNCC + // FIXME not sure about this one: + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) + #define EIGEN_ALIGNOF(x) __alignof(x) +#else + #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler +#endif + +// If the user explicitly disable vectorization, then we also disable alignment +#if defined(EIGEN_DONT_VECTORIZE) + #if defined(EIGEN_GPUCC) + // GPU code is always vectorized and requires memory alignment for + // statically allocated buffers. + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 + #else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 + #endif +#elif defined(__AVX512F__) + // 64 bytes static alignment is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 +#elif defined(__AVX__) + // 32 bytes static alignment is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 +#else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 +#endif + + +// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense +#define EIGEN_MIN_ALIGN_BYTES 16 + +// Defined the boundary (in bytes) on which the data needs to be aligned. Note +// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be +// aligned at all regardless of the value of this #define. 
+ +#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. +#endif + +// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated +// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 +#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) + #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES + #undef EIGEN_MAX_STATIC_ALIGN_BYTES + #endif + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 +#endif + +#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES + + // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES + + // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable + // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always + // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in + // certain common platform (compiler+architecture combinations) to avoid these problems. + // Only static alignment is really problematic (relies on nonstandard compiler extensions), + // try to keep heap alignment even when we have to disable static alignment. + #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) + // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. + // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. + // 4.8 and newer seem definitely unaffected. 
+ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #else + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 + #endif + + // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX + #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ + && !EIGEN_GCC3_OR_OLDER \ + && !EIGEN_COMP_SUNCC \ + && !EIGEN_OS_QNX + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 + #else + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 + #endif + + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT + #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES + #else + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 + #endif + +#endif + +// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES +#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. +// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES) +// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). +// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. + + +// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY +#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) +#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) +#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) +#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) +#else +#define EIGEN_ALIGN_MAX +#endif + + +// Dynamic alignment control + +#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 +#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. 
+#endif + +#ifdef EIGEN_DONT_ALIGN + #ifdef EIGEN_MAX_ALIGN_BYTES + #undef EIGEN_MAX_ALIGN_BYTES + #endif + #define EIGEN_MAX_ALIGN_BYTES 0 +#elif !defined(EIGEN_MAX_ALIGN_BYTES) + #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#endif + +#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#else +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES +#endif + + +#ifndef EIGEN_UNALIGNED_VECTORIZE +#define EIGEN_UNALIGNED_VECTORIZE 1 +#endif + +//---------------------------------------------------------------------- + +// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into +// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks +#if EIGEN_MAX_ALIGN_BYTES==0 + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif + + +// The following (except #include and _M_IX86_FP ??) can likely be +// removed as gcc 4.1 and msvc 2008 are not supported anyways. +#if EIGEN_COMP_MSVC + #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled + #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later + // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. + #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 + #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER + #endif + #endif +#else + #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) + #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC + #endif +#endif + +#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC)) + + #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) + + // Defines symbols for compile-time detection of which instructions are + // used. 
+ // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_SSE + #define EIGEN_VECTORIZE_SSE2 + + // Detect sse3/ssse3/sse4: + // gcc and icc defines __SSE3__, ... + // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you + // want to force the use of those instructions with msvc. + #ifdef __SSE3__ + #define EIGEN_VECTORIZE_SSE3 + #endif + #ifdef __SSSE3__ + #define EIGEN_VECTORIZE_SSSE3 + #endif + #ifdef __SSE4_1__ + #define EIGEN_VECTORIZE_SSE4_1 + #endif + #ifdef __SSE4_2__ + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX__ + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX2__ + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__)) + // MSVC does not expose a switch dedicated for FMA + // For MSVC, AVX2 => FMA + #define EIGEN_VECTORIZE_FMA + #endif + #if defined(__AVX512F__) + #ifndef EIGEN_VECTORIZE_FMA + #if EIGEN_COMP_GNUC + #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638). + #else + #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). 
+ #endif + #endif + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX512 + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_FMA + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #ifndef EIGEN_USE_SYCL + #ifdef __AVX512DQ__ + #define EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif + #ifdef __AVX512BF16__ + #define EIGEN_VECTORIZE_AVX512BF16 + #endif + #endif + #endif + + // Disable AVX support on broken xcode versions + #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 ) + // A nasty bug in the clang compiler shipped with xcode in a common compilation situation + // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1 + #ifdef EIGEN_VECTORIZE_AVX + #undef EIGEN_VECTORIZE_AVX + #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. 
" + #ifdef EIGEN_VECTORIZE_AVX2 + #undef EIGEN_VECTORIZE_AVX2 + #endif + #ifdef EIGEN_VECTORIZE_FMA + #undef EIGEN_VECTORIZE_FMA + #endif + #ifdef EIGEN_VECTORIZE_AVX512 + #undef EIGEN_VECTORIZE_AVX512 + #endif + #ifdef EIGEN_VECTORIZE_AVX512DQ + #undef EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef EIGEN_VECTORIZE_AVX512ER + #undef EIGEN_VECTORIZE_AVX512ER + #endif + #endif + // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with -macosx-version-min=10.15 and AVX + // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests + // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases + // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)" XCode 11.0 <- Produces many segfault and core dumping tests + // with -macosx-version-min=10.15 and AVX + // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with + // -macosx-version-min=10.15 and AVX + #endif + + // include files + + // This extern "C" works around a MINGW-w64 compilation issue + // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 + // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). + // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations + // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; + // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. + // notice that since these are C headers, the extern "C" is theoretically needed anyways. + extern "C" { + // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. + // Doing so triggers some issues with ICC. 
However old gcc versions seems to not have this file, thus: + #if EIGEN_COMP_ICC >= 1110 + #include + #else + #include + #include + #include + #ifdef EIGEN_VECTORIZE_SSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_1 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_2 + #include + #endif + #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) + #include + #endif + #endif + } // end extern "C" + + #elif defined __VSX__ + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_VSX + #include + // We need to #undef all these ugly tokens defined in + // => use __vector instead of vector + #undef bool + #undef vector + #undef pixel + + #elif defined __ALTIVEC__ + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ALTIVEC + #include + // We need to #undef all these ugly tokens defined in + // => use __vector instead of vector + #undef bool + #undef vector + #undef pixel + + #elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE) + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_NEON + #include + + // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and + // will not select the backend automatically + #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE) + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_SVE + #include + + // Since we depend on knowing SVE vector lengths at compile-time, we need + // to ensure a fixed lengths is set + #if defined __ARM_FEATURE_SVE_BITS + #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS + #else +#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set." +#endif + +#elif (defined __s390x__ && defined __VEC__) + +#define EIGEN_VECTORIZE +#define EIGEN_VECTORIZE_ZVECTOR +#include + +#elif defined __mips_msa + +// Limit MSA optimizations to little-endian CPUs for now. +// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs? 
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#if defined(__LP64__) +#define EIGEN_MIPS_64 +#else +#define EIGEN_MIPS_32 +#endif +#define EIGEN_VECTORIZE +#define EIGEN_VECTORIZE_MSA +#include +#endif + +#endif +#endif + +// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all +// compilers seem to follow this. We therefore include it explicitly. +// See also: https://bugs.llvm.org/show_bug.cgi?id=47955 +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + #include +#endif + +#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) + // We can use the optimized fp16 to float and float to fp16 conversion routines + #define EIGEN_HAS_FP16_C + + #if defined(EIGEN_COMP_CLANG) + // Workaround for clang: The FP16C intrinsics for clang are included by + // immintrin.h, as opposed to emmintrin.h as suggested by Intel: + // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 + #include + #endif +#endif + +#if defined EIGEN_CUDACC + #define EIGEN_VECTORIZE_GPU + #include + #if EIGEN_CUDA_SDK_VER >= 70500 + #define EIGEN_HAS_CUDA_FP16 + #endif +#endif + +#if defined(EIGEN_HAS_CUDA_FP16) + #include + #include +#endif + +#if defined(EIGEN_HIPCC) + #define EIGEN_VECTORIZE_GPU + #include + #define EIGEN_HAS_HIP_FP16 + #include +#endif + + +/** \brief Namespace containing all symbols from the %Eigen library. 
*/ +namespace Eigen { + +inline static const char *SimdInstructionSetsInUse(void) { +#if defined(EIGEN_VECTORIZE_AVX512) + return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_AVX) + return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_1) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; +#elif defined(EIGEN_VECTORIZE_SSSE3) + return "SSE, SSE2, SSE3, SSSE3"; +#elif defined(EIGEN_VECTORIZE_SSE3) + return "SSE, SSE2, SSE3"; +#elif defined(EIGEN_VECTORIZE_SSE2) + return "SSE, SSE2"; +#elif defined(EIGEN_VECTORIZE_ALTIVEC) + return "AltiVec"; +#elif defined(EIGEN_VECTORIZE_VSX) + return "VSX"; +#elif defined(EIGEN_VECTORIZE_NEON) + return "ARM NEON"; +#elif defined(EIGEN_VECTORIZE_SVE) + return "ARM SVE"; +#elif defined(EIGEN_VECTORIZE_ZVECTOR) + return "S390X ZVECTOR"; +#elif defined(EIGEN_VECTORIZE_MSA) + return "MIPS MSA"; +#else + return "None"; +#endif +} + +} // end namespace Eigen + + +#endif // EIGEN_CONFIGURE_VECTORIZATION_H diff --git a/inst/include/Eigen/src/Core/util/Constants.h b/inst/include/Eigen/src/Core/util/Constants.h index 7587d684..35dcaa7b 100644 --- a/inst/include/Eigen/src/Core/util/Constants.h +++ b/inst/include/Eigen/src/Core/util/Constants.h @@ -3,6 +3,7 @@ // // Copyright (C) 2008-2015 Gael Guennebaud // Copyright (C) 2007-2009 Benoit Jacob +// Copyright (C) 2020, Arm Limited and Contributors // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -25,6 +26,10 @@ const int Dynamic = -1; */ const int DynamicIndex = 0xffffff; +/** This value means that the increment to go from one value to another in a sequence is not constant for each step. + */ +const int UndefinedIncr = 0xfffffe; + /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm(). 
* The value Infinity there means the L-infinity norm. */ @@ -152,7 +157,7 @@ const unsigned int DirectAccessBit = 0x40; /** \deprecated \ingroup flags * * means the first coefficient packet is guaranteed to be aligned. - * An expression cannot has the AlignedBit without the PacketAccessBit flag. + * An expression cannot have the AlignedBit without the PacketAccessBit flag. * In other words, this means we are allow to perform an aligned packet access to the first element regardless * of the expression kind: * \code @@ -250,12 +255,6 @@ enum AlignmentType { #endif }; -/** \ingroup enums - * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */ -// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox -// TODO: find out what to do with that. Adapt the AlignedBox API ? -enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight }; - /** \ingroup enums * Enum containing possible values for the \p Direction parameter of * Reverse, PartialReduxExpr and VectorwiseOp. */ @@ -330,9 +329,20 @@ enum StorageOptions { * Enum for specifying whether to apply or solve on the left or right. */ enum SideType { /** Apply transformation on the left. */ - OnTheLeft = 1, + OnTheLeft = 1, /** Apply transformation on the right. */ - OnTheRight = 2 + OnTheRight = 2 +}; + +/** \ingroup enums + * Enum for specifying NaN-propagation behavior, e.g. for coeff-wise min/max. */ +enum NaNPropagationOptions { + /** Implementation defined behavior if NaNs are present. */ + PropagateFast = 0, + /** Always propagate NaNs. */ + PropagateNaN, + /** Always propagate not-NaNs. 
*/ + PropagateNumbers }; /* the following used to be written as: @@ -464,6 +474,8 @@ namespace Architecture AltiVec = 0x2, VSX = 0x3, NEON = 0x4, + MSA = 0x5, + SVE = 0x6, #if defined EIGEN_VECTORIZE_SSE Target = SSE #elif defined EIGEN_VECTORIZE_ALTIVEC @@ -472,6 +484,10 @@ namespace Architecture Target = VSX #elif defined EIGEN_VECTORIZE_NEON Target = NEON +#elif defined EIGEN_VECTORIZE_SVE + Target = SVE +#elif defined EIGEN_VECTORIZE_MSA + Target = MSA #else Target = Generic #endif diff --git a/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h b/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h index f346a120..85229e8e 100755 --- a/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h @@ -37,10 +37,21 @@ #elif defined __clang__ // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant // this is really a stupid warning as it warns on compile-time expressions involving enums - //#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS - // #pragma clang diagnostic push - //#endif - //#pragma clang diagnostic ignored "-Wconstant-logical-operand" + // #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS + // #pragma clang diagnostic push + // #endif + // #pragma clang diagnostic ignored "-Wconstant-logical-operand" + // #if __clang_major__ >= 3 && __clang_minor__ >= 5 + // #pragma clang diagnostic ignored "-Wabsolute-value" + // #endif + // #if __clang_major__ >= 10 + // #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" + // #endif + // #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // // warning: generic selections are a C11-specific feature + // // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + // #pragma clang diagnostic ignored "-Wc11-extensions" + // #endif #elif defined __GNUC__ diff --git a/inst/include/Eigen/src/Core/util/ForwardDeclarations.h 
b/inst/include/Eigen/src/Core/util/ForwardDeclarations.h index 134544f9..2f9cc449 100644 --- a/inst/include/Eigen/src/Core/util/ForwardDeclarations.h +++ b/inst/include/Eigen/src/Core/util/ForwardDeclarations.h @@ -79,6 +79,8 @@ template class ForceAlignedAccess; template class SwapWrapper; template class Block; +template class IndexedView; +template class Reshaped; template class VectorBlock; template class Transpose; @@ -108,7 +110,7 @@ template class TranspositionsWrapper; template::has_write_access ? WriteAccessors : ReadOnlyAccessors > class MapBase; -template class Stride; +template class Stride; template class InnerStride; template class OuterStride; template > class Map; @@ -129,6 +131,10 @@ template class SolverBase; template class InnerIterator; namespace internal { +template class generic_randaccess_stl_iterator; +template class pointer_based_stl_iterator; +template class subvector_stl_iterator; +template class subvector_stl_reverse_iterator; template struct kernel_retval_base; template struct kernel_retval; template struct image_retval_base; @@ -174,14 +180,15 @@ template struct scalar_sum_op; template struct scalar_difference_op; template struct scalar_conj_product_op; -template struct scalar_min_op; -template struct scalar_max_op; +template struct scalar_min_op; +template struct scalar_max_op; template struct scalar_opposite_op; template struct scalar_conjugate_op; template struct scalar_real_op; template struct scalar_imag_op; template struct scalar_abs_op; template struct scalar_abs2_op; +template struct scalar_absolute_difference_op; template struct scalar_sqrt_op; template struct scalar_rsqrt_op; template struct scalar_exp_op; @@ -198,7 +205,7 @@ template struct scalar_cast_op; template struct scalar_random_op; template struct scalar_constant_op; template struct scalar_identity_op; -template struct scalar_sign_op; +template struct scalar_sign_op; template struct scalar_pow_op; template struct scalar_hypot_op; template struct scalar_product_op; @@ 
-209,11 +216,27 @@ template struct scalar_lgamma_op; template struct scalar_digamma_op; template struct scalar_erf_op; template struct scalar_erfc_op; +template struct scalar_ndtri_op; template struct scalar_igamma_op; template struct scalar_igammac_op; template struct scalar_zeta_op; template struct scalar_betainc_op; +// Bessel functions in SpecialFunctions module +template struct scalar_bessel_i0_op; +template struct scalar_bessel_i0e_op; +template struct scalar_bessel_i1_op; +template struct scalar_bessel_i1e_op; +template struct scalar_bessel_j0_op; +template struct scalar_bessel_y0_op; +template struct scalar_bessel_j1_op; +template struct scalar_bessel_y1_op; +template struct scalar_bessel_k0_op; +template struct scalar_bessel_k0e_op; +template struct scalar_bessel_k1_op; +template struct scalar_bessel_k1e_op; + + } // end namespace internal struct IOFormat; @@ -251,6 +274,7 @@ template class HouseholderQR; template class ColPivHouseholderQR; template class FullPivHouseholderQR; template class CompleteOrthogonalDecomposition; +template class SVDBase; template class JacobiSVD; template class BDCSVD; template class LLT; diff --git a/inst/include/Eigen/src/Core/util/IndexedViewHelper.h b/inst/include/Eigen/src/Core/util/IndexedViewHelper.h new file mode 100644 index 00000000..f85de305 --- /dev/null +++ b/inst/include/Eigen/src/Core/util/IndexedViewHelper.h @@ -0,0 +1,186 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +#ifndef EIGEN_INDEXED_VIEW_HELPER_H +#define EIGEN_INDEXED_VIEW_HELPER_H + +namespace Eigen { + +namespace internal { +struct symbolic_last_tag {}; +} + +/** \var last + * \ingroup Core_Module + * + * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns + * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * This symbolic placeholder supports standard arithmetic operations. + * + * A typical usage example would be: + * \code + * using namespace Eigen; + * using Eigen::last; + * VectorXd v(n); + * v(seq(2,last-2)).setOnes(); + * \endcode + * + * \sa end + */ +static const symbolic::SymbolExpr last; // PLEASE use Eigen::last instead of Eigen::placeholders::last + +/** \var lastp1 + * \ingroup Core_Module + * + * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically + * reference the last+1 element/row/columns of the underlying vector or matrix once + * passed to DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * This symbolic placeholder supports standard arithmetic operations. + * It is essentially an alias to last+fix<1>. + * + * \sa last + */ +#ifdef EIGEN_PARSED_BY_DOXYGEN +static const auto lastp1 = last+fix<1>; +#else +// Using a FixedExpr<1> expression is important here to make sure the compiler +// can fully optimize the computation starting indices with zero overhead. 
+static const symbolic::AddExpr,symbolic::ValueExpr > > lastp1(last+fix<1>()); +#endif + +namespace internal { + + // Replace symbolic last/end "keywords" by their true runtime value +inline Index eval_expr_given_size(Index x, Index /* size */) { return x; } + +template +FixedInt eval_expr_given_size(FixedInt x, Index /*size*/) { return x; } + +template +Index eval_expr_given_size(const symbolic::BaseExpr &x, Index size) +{ + return x.derived().eval(last=size-1); +} + +// Extract increment/step at compile time +template struct get_compile_time_incr { + enum { value = UndefinedIncr }; +}; + +// Analogue of std::get<0>(x), but tailored for our needs. +template +EIGEN_CONSTEXPR Index first(const T& x) EIGEN_NOEXCEPT { return x.first(); } + +// IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by MatrixSlice +// The generic implementation is a no-op +template +struct IndexedViewCompatibleType { + typedef T type; +}; + +template +const T& makeIndexedViewCompatible(const T& x, Index /*size*/, Q) { return x; } + +//-------------------------------------------------------------------------------- +// Handling of a single Index +//-------------------------------------------------------------------------------- + +struct SingleRange { + enum { + SizeAtCompileTime = 1 + }; + SingleRange(Index val) : m_value(val) {} + Index operator[](Index) const { return m_value; } + static EIGEN_CONSTEXPR Index size() EIGEN_NOEXCEPT { return 1; } + Index first() const EIGEN_NOEXCEPT { return m_value; } + Index m_value; +}; + +template<> struct get_compile_time_incr { + enum { value = 1 }; // 1 or 0 ?? 
+}; + +// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods) +template +struct IndexedViewCompatibleType::value>::type> { + // Here we could simply use Array, but maybe it's less work for the compiler to use + // a simpler wrapper as SingleRange + //typedef Eigen::Array type; + typedef SingleRange type; +}; + +template +struct IndexedViewCompatibleType::value>::type> { + typedef SingleRange type; +}; + + +template +typename enable_if::value,SingleRange>::type +makeIndexedViewCompatible(const T& id, Index size, SpecializedType) { + return eval_expr_given_size(id,size); +} + +//-------------------------------------------------------------------------------- +// Handling of all +//-------------------------------------------------------------------------------- + +struct all_t { all_t() {} }; + +// Convert a symbolic 'all' into a usable range type +template +struct AllRange { + enum { SizeAtCompileTime = XprSize }; + AllRange(Index size = XprSize) : m_size(size) {} + EIGEN_CONSTEXPR Index operator[](Index i) const EIGEN_NOEXCEPT { return i; } + EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_size.value(); } + EIGEN_CONSTEXPR Index first() const EIGEN_NOEXCEPT { return 0; } + variable_if_dynamic m_size; +}; + +template +struct IndexedViewCompatibleType { + typedef AllRange type; +}; + +template +inline AllRange::value> makeIndexedViewCompatible(all_t , XprSizeType size, SpecializedType) { + return AllRange::value>(size); +} + +template struct get_compile_time_incr > { + enum { value = 1 }; +}; + +} // end namespace internal + + +/** \var all + * \ingroup Core_Module + * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns + */ +static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all + + +namespace placeholders { + typedef symbolic::SymbolExpr last_t; + typedef 
symbolic::AddExpr,symbolic::ValueExpr > > end_t; + typedef Eigen::internal::all_t all_t; + + EIGEN_DEPRECATED static const all_t all = Eigen::all; // PLEASE use Eigen::all instead of Eigen::placeholders::all + EIGEN_DEPRECATED static const last_t last = Eigen::last; // PLEASE use Eigen::last instead of Eigen::placeholders::last + EIGEN_DEPRECATED static const end_t end = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end +} + +} // end namespace Eigen + +#endif // EIGEN_INDEXED_VIEW_HELPER_H diff --git a/inst/include/Eigen/src/Core/util/IntegralConstant.h b/inst/include/Eigen/src/Core/util/IntegralConstant.h new file mode 100644 index 00000000..945d426e --- /dev/null +++ b/inst/include/Eigen/src/Core/util/IntegralConstant.h @@ -0,0 +1,272 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_INTEGRAL_CONSTANT_H +#define EIGEN_INTEGRAL_CONSTANT_H + +namespace Eigen { + +namespace internal { + +template class FixedInt; +template class VariableAndFixedInt; + +/** \internal + * \class FixedInt + * + * This class embeds a compile-time integer \c N. + * + * It is similar to c++11 std::integral_constant but with some additional features + * such as: + * - implicit conversion to int + * - arithmetic and some bitwise operators: -, +, *, /, %, &, | + * - c++98/14 compatibility with fix and fix() syntax to define integral constants. + * + * It is strongly discouraged to directly deal with this class FixedInt. Instances are expected to + * be created by the user using Eigen::fix or Eigen::fix().
In C++98-11, the former syntax does + * not create a FixedInt instance but rather a pointer to function that needs to be \em cleaned-up + * using the generic helper: + * \code + * internal::cleanup_index_type::type + * internal::cleanup_index_type::type + * \endcode + * where T can be a FixedInt, a pointer to function FixedInt (*)(), or numerous other integer-like representations. + * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values. + * + * For convenience, you can extract the compile-time value \c N in a generic way using the following helper: + * \code + * internal::get_fixed_value::value + * \endcode + * that will give you \c N if T equals FixedInt or FixedInt (*)(), and \c DefaultVal if T does not embed any compile-time value (e.g., T==int). + * + * \sa fix, class VariableAndFixedInt + */ +template class FixedInt +{ +public: + static const int value = N; + EIGEN_CONSTEXPR operator int() const { return value; } + FixedInt() {} + FixedInt( VariableAndFixedInt other) { + #ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(other); + #endif + eigen_internal_assert(int(other)==N); + } + + FixedInt<-N> operator-() const { return FixedInt<-N>(); } + template + FixedInt operator+( FixedInt) const { return FixedInt(); } + template + FixedInt operator-( FixedInt) const { return FixedInt(); } + template + FixedInt operator*( FixedInt) const { return FixedInt(); } + template + FixedInt operator/( FixedInt) const { return FixedInt(); } + template + FixedInt operator%( FixedInt) const { return FixedInt(); } + template + FixedInt operator|( FixedInt) const { return FixedInt(); } + template + FixedInt operator&( FixedInt) const { return FixedInt(); } + +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + // Needed in C++14 to allow fix(): + FixedInt operator() () const { return *this; } + + VariableAndFixedInt operator() (int val) const { return VariableAndFixedInt(val); } +#else + FixedInt ( FixedInt (*)() ) {} +#endif + +#if
EIGEN_HAS_CXX11 + FixedInt(std::integral_constant) {} +#endif +}; + +/** \internal + * \class VariableAndFixedInt + * + * This class embeds both a compile-time integer \c N and a runtime integer. + * Both values are supposed to be equal unless the compile-time value \c N has a special + * value meaning that the runtime-value should be used. Depending on the context, this special + * value can be either Eigen::Dynamic (for positive quantities) or Eigen::DynamicIndex (for + * quantities that can be negative). + * + * It is the return-type of the function Eigen::fix(int), and most of the time this is the only + * way it is used. It is strongly discouraged to directly deal with instances of VariableAndFixedInt. + * Indeed, in order to write generic code, it is the responsibility of the callee to properly convert + * it to either a true compile-time quantity (i.e. a FixedInt), or to a runtime quantity (e.g., an Index) + * using the following generic helper: + * \code + * internal::cleanup_index_type::type + * internal::cleanup_index_type::type + * \endcode + * where T can be a template instantiation of VariableAndFixedInt or numerous other integer-like representations. + * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values. + * + * For convenience, you can also extract the compile-time value \c N using the following helper: + * \code + * internal::get_fixed_value::value + * \endcode + * that will give you \c N if T equals VariableAndFixedInt, and \c DefaultVal if T does not embed any compile-time value (e.g., T==int). 
+ * + * \sa fix(int), class FixedInt + */ +template class VariableAndFixedInt +{ +public: + static const int value = N; + operator int() const { return m_value; } + VariableAndFixedInt(int val) { m_value = val; } +protected: + int m_value; +}; + +template struct get_fixed_value { + static const int value = Default; +}; + +template struct get_fixed_value,Default> { + static const int value = N; +}; + +#if !EIGEN_HAS_CXX14 +template struct get_fixed_value (*)(),Default> { + static const int value = N; +}; +#endif + +template struct get_fixed_value,Default> { + static const int value = N ; +}; + +template +struct get_fixed_value,Default> { + static const int value = N; +}; + +template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } +#if !EIGEN_HAS_CXX14 +template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } +#endif + +// Cleanup integer/FixedInt/VariableAndFixedInt/etc types: + +// By default, no cleanup: +template struct cleanup_index_type { typedef T type; }; + +// Convert any integral type (e.g., short, int, unsigned int, etc.) 
to Eigen::Index +template struct cleanup_index_type::value>::type> { typedef Index type; }; + +#if !EIGEN_HAS_CXX14 +// In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: +template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; +#endif + +// If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value: +template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; +// If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index): +template struct cleanup_index_type, DynamicKey> { typedef Index type; }; + +#if EIGEN_HAS_CXX11 +template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; +#endif + +} // end namespace internal + +#ifndef EIGEN_PARSED_BY_DOXYGEN + +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES +template +static const internal::FixedInt fix{}; +#else +template +inline internal::FixedInt fix() { return internal::FixedInt(); } + +// The generic typename T is mandatory. Otherwise, a code like fix could refer to either the function above or this next overload. +// This way a code like fix can only refer to the previous function. +template +inline internal::VariableAndFixedInt fix(T val) { return internal::VariableAndFixedInt(internal::convert_index(val)); } +#endif + +#else // EIGEN_PARSED_BY_DOXYGEN + +/** \var fix() + * \ingroup Core_Module + * + * This \em identifier permits to construct an object embedding a compile-time integer \c N. + * + * \tparam N the compile-time integer value + * + * It is typically used in conjunction with the Eigen::seq and Eigen::seqN functions to pass compile-time values to them: + * \code + * seqN(10,fix<4>,fix<-3>) // <=> [10 7 4 1] + * \endcode + * + * See also the function fix(int) to pass both a compile-time and runtime value. 
+ * + * In c++14, it is implemented as: + * \code + * template static const internal::FixedInt fix{}; + * \endcode + * where internal::FixedInt is an internal template class similar to + * \c std::integral_constant + * Here, \c fix is thus an object of type \c internal::FixedInt. + * + * In c++98/11, it is implemented as a function: + * \code + * template inline internal::FixedInt fix(); + * \endcode + * Here internal::FixedInt is thus a pointer to function. + * + * If for some reason you want a true object in c++98 then you can write: \code fix() \endcode which is also valid in c++14. + * + * \sa fix(int), seq, seqN + */ +template +static const auto fix(); + +/** \fn fix(int) + * \ingroup Core_Module + * + * This function returns an object embedding both a compile-time integer \c N, and a fallback runtime value \a val. + * + * \tparam N the compile-time integer value + * \param val the fallback runtime integer value + * + * This function is a more general version of the \ref fix identifier/function that can be used in template code + * where the compile-time value could turn out to actually mean "undefined at compile-time". For positive integers + * such as a size or a dimension, this case is identified by Eigen::Dynamic, whereas runtime signed integers + * (e.g., an increment/stride) are identified as Eigen::DynamicIndex. In such a case, the runtime value \a val + * will be used as a fallback. + * + * A typical use case would be: + * \code + * template void foo(const MatrixBase &mat) { + * const int N = Derived::RowsAtCompileTime==Dynamic ? Dynamic : Derived::RowsAtCompileTime/2; + * const int n = mat.rows()/2; + * ... mat( seqN(0,fix(n) ) ...; + * } + * \endcode + * In this example, the function Eigen::seqN knows that the second argument is expected to be a size. + * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dissmissed, and converted to an Eigen::Index of value \c n. 
+ * Otherwise, the runtime-value \c n will be dissmissed, and the returned ArithmeticSequence will be of the exact same type as seqN(0,fix) . + * + * \sa fix, seqN, class ArithmeticSequence + */ +template +static const auto fix(int val); + +#endif // EIGEN_PARSED_BY_DOXYGEN + +} // end namespace Eigen + +#endif // EIGEN_INTEGRAL_CONSTANT_H diff --git a/inst/include/Eigen/src/Core/util/MKL_support.h b/inst/include/Eigen/src/Core/util/MKL_support.h index b7d6ecc7..17963fad 100755 --- a/inst/include/Eigen/src/Core/util/MKL_support.h +++ b/inst/include/Eigen/src/Core/util/MKL_support.h @@ -55,7 +55,11 @@ #if defined EIGEN_USE_MKL -# include +# if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL) +# define MKL_DIRECT_CALL +# define MKL_DIRECT_CALL_JUST_SET +# endif +# include /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ # ifndef INTEL_MKL_VERSION # undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */ @@ -69,6 +73,9 @@ # undef EIGEN_USE_MKL_VML # undef EIGEN_USE_LAPACKE_STRICT # undef EIGEN_USE_LAPACKE +# ifdef MKL_DIRECT_CALL_JUST_SET +# undef MKL_DIRECT_CALL +# endif # endif #endif diff --git a/inst/include/Eigen/src/Core/util/Macros.h b/inst/include/Eigen/src/Core/util/Macros.h index 6b0399eb..986c3d44 100644 --- a/inst/include/Eigen/src/Core/util/Macros.h +++ b/inst/include/Eigen/src/Core/util/Macros.h @@ -11,19 +11,56 @@ #ifndef EIGEN_MACROS_H #define EIGEN_MACROS_H +//------------------------------------------------------------------------------------------ +// Eigen version and basic defaults +//------------------------------------------------------------------------------------------ + #define EIGEN_WORLD_VERSION 3 -#define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 9 +#define EIGEN_MAJOR_VERSION 4 +#define EIGEN_MINOR_VERSION 0 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ 
EIGEN_MINOR_VERSION>=z)))) +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor +#else +#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor +#endif + +#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t +#endif + +// Upperbound on the C++ version to use. +// Expected values are 03, 11, 14, 17, etc. +// By default, let's use an arbitrarily large C++ version. +#ifndef EIGEN_MAX_CPP_VER +#define EIGEN_MAX_CPP_VER 99 +#endif + +/** Allows to disable some optimizations which might affect the accuracy of the result. + * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. + * They currently include: + * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. + */ +#ifndef EIGEN_FAST_MATH +#define EIGEN_FAST_MATH 1 +#endif + +#ifndef EIGEN_STACK_ALLOCATION_LIMIT +// 131072 == 128 KB +#define EIGEN_STACK_ALLOCATION_LIMIT 131072 +#endif + +//------------------------------------------------------------------------------------------ // Compiler identification, EIGEN_COMP_* +//------------------------------------------------------------------------------------------ /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC #ifdef __GNUC__ - #define EIGEN_COMP_GNUC 1 + #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__) #else #define EIGEN_COMP_GNUC 0 #endif @@ -35,6 +72,12 @@ #define EIGEN_COMP_CLANG 0 #endif +/// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML +#if defined(__castxml__) + #define EIGEN_COMP_CASTXML 1 +#else + #define EIGEN_COMP_CASTXML 0 +#endif /// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm #if defined(__llvm__) @@ -71,14 +114,44 @@ #define EIGEN_COMP_MSVC 0 #endif +#if defined(__NVCC__) +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + 
(__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) + #define EIGEN_COMP_NVCC __CUDACC_VER__ +#else + #error "NVCC did not define compiler version." +#endif +#else + #define EIGEN_COMP_NVCC 0 +#endif + // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC: -// name ver MSC_VER -// 2008 9 1500 -// 2010 10 1600 -// 2012 11 1700 -// 2013 12 1800 -// 2015 14 1900 -// "15" 15 1900 +// name ver MSC_VER +// 2008 9 1500 +// 2010 10 1600 +// 2012 11 1700 +// 2013 12 1800 +// 2015 14 1900 +// "15" 15 1900 +// 2017-14.1 15.0 1910 +// 2017-14.11 15.3 1911 +// 2017-14.12 15.5 1912 +// 2017-14.13 15.6 1913 +// 2017-14.14 15.7 1914 + +/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise. +#if defined(_MSVC_LANG) + #define EIGEN_COMP_MSVC_LANG _MSVC_LANG +#else + #define EIGEN_COMP_MSVC_LANG 0 +#endif + +// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG: +// MSVC option Standard MSVC_LANG +// /std:c++14 (default as of VS 2019) C++14 201402L +// /std:c++17 C++17 201703L +// /std:c++latest >C++17 >201703L /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) @@ -87,16 +160,21 @@ #define EIGEN_COMP_MSVC_STRICT 0 #endif -/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++ -#if defined(__IBMCPP__) || defined(__xlc__) - #define EIGEN_COMP_IBM 1 +/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++ +// XLC version +// 3.1 0x0301 +// 4.5 0x0405 +// 5.0 0x0500 +// 12.1 0x0C01 +#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__) + #define EIGEN_COMP_IBM __xlC__ #else #define EIGEN_COMP_IBM 0 #endif -/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler +/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland 
Group Compiler #if defined(__PGI) - #define EIGEN_COMP_PGI 1 + #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__) #else #define EIGEN_COMP_PGI 0 #endif @@ -108,7 +186,7 @@ #define EIGEN_COMP_ARM 0 #endif -/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler +/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler #if defined(__EMSCRIPTEN__) #define EIGEN_COMP_EMSCRIPTEN 1 #else @@ -142,9 +220,13 @@ #endif + +//------------------------------------------------------------------------------------------ // Architecture identification, EIGEN_ARCH_* +//------------------------------------------------------------------------------------------ -#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64) + +#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__amd64) #define EIGEN_ARCH_x86_64 1 #else #define EIGEN_ARCH_x86_64 0 @@ -170,18 +252,61 @@ #endif /// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64 -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) #define EIGEN_ARCH_ARM64 1 #else #define EIGEN_ARCH_ARM64 0 #endif +/// \internal EIGEN_ARCH_ARM_OR_ARM64 set to 1 if the architecture is ARM or ARM64 #if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 #define EIGEN_ARCH_ARM_OR_ARM64 1 #else #define EIGEN_ARCH_ARM_OR_ARM64 0 #endif +/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater. 
+#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8 +#define EIGEN_ARCH_ARMV8 1 +#else +#define EIGEN_ARCH_ARMV8 0 +#endif + + +/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE +/// compliant Arm fp16 type +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16 + #if defined(__ARM_FP16_FORMAT_IEEE) + #define EIGEN_HAS_ARM64_FP16 1 + #else + #define EIGEN_HAS_ARM64_FP16 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture +/// supports Neon vector intrinsics for fp16. +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 + #else + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture +/// supports Neon scalar intrinsics for fp16. +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 + #endif + #endif +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -212,7 +337,9 @@ +//------------------------------------------------------------------------------------------ // Operating system identification, EIGEN_OS_* +//------------------------------------------------------------------------------------------ /// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant #if defined(__unix__) || defined(__unix) @@ -299,9 +426,17 @@ #define EIGEN_OS_WIN_STRICT 0 #endif -/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN +/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN +// compiler solaris __SUNPRO_C +// version studio +// 5.7 10 0x570 +// 5.8 11 0x580 +// 5.9 12 0x590 +// 5.10 12.1 0x5100 +// 5.11 12.2 0x5110 +// 5.12 
12.3 0x5120 #if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__)) - #define EIGEN_OS_SUN 1 + #define EIGEN_OS_SUN __SUNPRO_C #else #define EIGEN_OS_SUN 0 #endif @@ -314,26 +449,137 @@ #endif +//------------------------------------------------------------------------------------------ +// Detect GPU compilers and architectures +//------------------------------------------------------------------------------------------ -#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG - // see bug 89 - #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0 -#else - #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1 +// NVCC is not supported as the target platform for HIPCC +// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive +#if defined(__NVCC__) && defined(__HIPCC__) + #error "NVCC as the target platform for HIPCC is currently not supported." #endif -// This macro can be used to prevent from macro expansion, e.g.: -// std::max EIGEN_NOT_A_MACRO(a,b) -#define EIGEN_NOT_A_MACRO +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) + // Means the compiler is either nvcc or clang with CUDA enabled + #define EIGEN_CUDACC __CUDACC__ +#endif -#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR -#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor +#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) + // Means we are generating code for the device + #define EIGEN_CUDA_ARCH __CUDA_ARCH__ +#endif + +#if defined(EIGEN_CUDACC) +#include + #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10) #else -#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor + #define EIGEN_CUDA_SDK_VER 0 #endif -#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t +#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) + // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP) + #define EIGEN_HIPCC __HIPCC__ + + // We need to include hip_runtime.h here because it pulls in + // ++ hip_common.h which contains the define for 
__HIP_DEVICE_COMPILE__ + // ++ host_defines.h which contains the defines for the __host__ and __device__ macros + #include + + #if defined(__HIP_DEVICE_COMPILE__) + // analogous to EIGEN_CUDA_ARCH, but for HIP + #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ + #endif + + // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute + // value to 1024. The compiler assigns a default value of 256 when the attribute is not + // specified. This results in failures on the HIP platform, for cases when a GPU kernel + // without an explicit launch_bounds attribute is called with a threads_per_block value + // greater than 256. + // + // This is a regression in functioanlity and is expected to be fixed within the next + // couple of ROCm releases (compiler will go back to using 1024 value as the default) + // + // In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds + // attribute. + + #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024) + +#endif + +#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024) +#define EIGEN_HIP_LAUNCH_BOUNDS_1024 +#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024) + +// Unify CUDA/HIPCC + +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC +// +#define EIGEN_GPUCC +// +// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels +// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels +// +// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels. 
+// For those cases, the corresponding code should be guarded with +// #if defined(EIGEN_GPUCC) +// instead of +// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDACC) +// +#endif + +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE +// +#define EIGEN_GPU_COMPILE_PHASE +// +// GPU compilers (HIPCC, NVCC) typically do two passes over the source code, +// + one to compile the source for the "host" (ie CPU) +// + another to compile the source for the "device" (ie. GPU) +// +// Code that needs to enabled only during the either the "host" or "device" compilation phase +// needs to be guarded with a macro that indicates the current compilation phase +// +// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP +// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA +// +// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA +// For those cases, the code should be guarded with +// #if defined(EIGEN_GPU_COMPILE_PHASE) +// instead of +// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDA_ARCH) +// +#endif + +#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__) +// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro. +// In most cases we want to check if both macros are defined which can be done using the define below. 
+#define SYCL_DEVICE_ONLY +#endif + +//------------------------------------------------------------------------------------------ +// Detect Compiler/Architecture/OS specific features +//------------------------------------------------------------------------------------------ + +#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG + // see bug 89 + #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0 +#else + #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1 #endif // Cross compiler wrapper around LLVM's __has_builtin @@ -349,26 +595,79 @@ # define __has_feature(x) 0 #endif -// Upperbound on the C++ version to use. -// Expected values are 03, 11, 14, 17, etc. -// By default, let's use an arbitrarily large C++ version. -#ifndef EIGEN_MAX_CPP_VER -#define EIGEN_MAX_CPP_VER 99 +// Some old compilers do not support template specializations like: +// template void foo(const T x[N]); +#if !( EIGEN_COMP_CLANG && ( (EIGEN_COMP_CLANG<309) \ + || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000))) \ + || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1 +#else +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0 +#endif + +// The macro EIGEN_CPLUSPLUS is a replacement for __cplusplus/_MSVC_LANG that +// works for both platforms, indicating the C++ standard version number. +// +// With MSVC, without defining /Zc:__cplusplus, the __cplusplus macro will +// report 199711L regardless of the language standard specified via /std. +// We need to rely on _MSVC_LANG instead, which is only available after +// VS2015.3. +#if EIGEN_COMP_MSVC_LANG > 0 +#define EIGEN_CPLUSPLUS EIGEN_COMP_MSVC_LANG +#elif EIGEN_COMP_MSVC >= 1900 +#define EIGEN_CPLUSPLUS 201103L +#elif defined(__cplusplus) +#define EIGEN_CPLUSPLUS __cplusplus +#else +#define EIGEN_CPLUSPLUS 0 +#endif + +// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler. 
+// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER +// is defined to 17. +#if EIGEN_CPLUSPLUS > 201703L + #define EIGEN_COMP_CXXVER 20 +#elif EIGEN_CPLUSPLUS > 201402L + #define EIGEN_COMP_CXXVER 17 +#elif EIGEN_CPLUSPLUS > 201103L + #define EIGEN_COMP_CXXVER 14 +#elif EIGEN_CPLUSPLUS >= 201103L + #define EIGEN_COMP_CXXVER 11 +#else + #define EIGEN_COMP_CXXVER 03 +#endif + +#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14 + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1 + #else + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0 + #endif #endif -#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) + +// The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features +// but in practice we should not rely on them but rather on the availabilty of +// individual features as defined later. +// This is why there is no EIGEN_HAS_CXX17. +// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11. +#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11 #define EIGEN_HAS_CXX11 1 #else #define EIGEN_HAS_CXX11 0 #endif +#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14 +#define EIGEN_HAS_CXX14 1 +#else +#define EIGEN_HAS_CXX14 0 +#endif // Do we support r-value references? #ifndef EIGEN_HAS_RVALUE_REFERENCES #if EIGEN_MAX_CPP_VER>=11 && \ (__has_feature(cxx_rvalue_references) || \ - (defined(__cplusplus) && __cplusplus >= 201103L) || \ - (EIGEN_COMP_MSVC >= 1600)) + (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600)) #define EIGEN_HAS_RVALUE_REFERENCES 1 #else #define EIGEN_HAS_RVALUE_REFERENCES 0 @@ -376,12 +675,14 @@ #endif // Does the compiler support C99? 
+// Need to include to make sure _GLIBCXX_USE_C99 gets defined +#include #ifndef EIGEN_HAS_C99_MATH #if EIGEN_MAX_CPP_VER>=11 && \ ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \ - || (EIGEN_COMP_MSVC >= 1900) ) + || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY)) #define EIGEN_HAS_C99_MATH 1 #else #define EIGEN_HAS_C99_MATH 0 @@ -389,14 +690,50 @@ #endif // Does the compiler support result_of? +// result_of was deprecated in c++17 and removed in c++ 20 #ifndef EIGEN_HAS_STD_RESULT_OF -#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))) +#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17 #define EIGEN_HAS_STD_RESULT_OF 1 #else #define EIGEN_HAS_STD_RESULT_OF 0 #endif #endif +// Does the compiler support std::hash? +#ifndef EIGEN_HAS_STD_HASH +// The std::hash struct is defined in C++11 but is not labelled as a __device__ +// function and is not constexpr, so cannot be used on device. +#if EIGEN_HAS_CXX11 && !defined(EIGEN_GPU_COMPILE_PHASE) +#define EIGEN_HAS_STD_HASH 1 +#else +#define EIGEN_HAS_STD_HASH 0 +#endif +#endif // EIGEN_HAS_STD_HASH + +#ifndef EIGEN_HAS_STD_INVOKE_RESULT +#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17 +#define EIGEN_HAS_STD_INVOKE_RESULT 1 +#else +#define EIGEN_HAS_STD_INVOKE_RESULT 0 +#endif +#endif + +#ifndef EIGEN_HAS_ALIGNAS +#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 && \ + ( __has_feature(cxx_alignas) \ + || EIGEN_HAS_CXX14 \ + || (EIGEN_COMP_MSVC >= 1800) \ + || (EIGEN_GNUC_AT_LEAST(4,8)) \ + || (EIGEN_COMP_CLANG>=305) \ + || (EIGEN_COMP_ICC>=1500) \ + || (EIGEN_COMP_PGI>=1500) \ + || (EIGEN_COMP_SUNCC>=0x5130)) +#define EIGEN_HAS_ALIGNAS 1 +#else +#define EIGEN_HAS_ALIGNAS 0 +#endif +#endif + // Does the compiler support type_traits? // - full support of type traits was added only to GCC 5.1.0. 
// - 20150626 corresponds to the last release of 4.x libstdc++ @@ -413,11 +750,13 @@ // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES -#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) \ + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#elif EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) && defined(SYCL_DEVICE_ONLY) +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 #else #define EIGEN_HAS_VARIADIC_TEMPLATES 0 #endif @@ -425,27 +764,33 @@ // Does the compiler fully support const expressions? (as in c++14) #ifndef EIGEN_HAS_CONSTEXPR + #if defined(EIGEN_CUDACC) + // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above + #if EIGEN_MAX_CPP_VER>=14 && (EIGEN_COMP_CXXVER >= 11 && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500)) + #define EIGEN_HAS_CONSTEXPR 1 + #endif + #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (EIGEN_COMP_CXXVER >= 14) || \ + (EIGEN_GNUC_AT_LEAST(4,8) && (EIGEN_COMP_CXXVER >= 11)) || \ + (EIGEN_COMP_CLANG >= 306 && (EIGEN_COMP_CXXVER >= 11))) + #define EIGEN_HAS_CONSTEXPR 1 + #endif -#ifdef __CUDACC__ -// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) - #define EIGEN_HAS_CONSTEXPR 1 -#endif -#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ - (EIGEN_GNUC_AT_LEAST(4,8) && 
(__cplusplus > 199711L))) -#define EIGEN_HAS_CONSTEXPR 1 -#endif + #ifndef EIGEN_HAS_CONSTEXPR + #define EIGEN_HAS_CONSTEXPR 0 + #endif -#ifndef EIGEN_HAS_CONSTEXPR -#define EIGEN_HAS_CONSTEXPR 0 -#endif +#endif // EIGEN_HAS_CONSTEXPR +#if EIGEN_HAS_CONSTEXPR +#define EIGEN_CONSTEXPR constexpr +#else +#define EIGEN_CONSTEXPR #endif // Does the compiler support C++11 math? // Let's be conservative and enable the default C++11 implementation only if we are sure it exists #ifndef EIGEN_HAS_CXX11_MATH - #if EIGEN_MAX_CPP_VER>=11 && ((__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + #if EIGEN_MAX_CPP_VER>=11 && ((EIGEN_COMP_CXXVER > 11) || (EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)) #define EIGEN_HAS_CXX11_MATH 1 #else @@ -456,9 +801,8 @@ // Does the compiler support proper C++11 containers? 
#ifndef EIGEN_HAS_CXX11_CONTAINERS #if EIGEN_MAX_CPP_VER>=11 && \ - ((__cplusplus > 201103L) \ - || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ - || EIGEN_COMP_MSVC >= 1900) + ((EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400))) #define EIGEN_HAS_CXX11_CONTAINERS 1 #else #define EIGEN_HAS_CXX11_CONTAINERS 0 @@ -469,24 +813,88 @@ #ifndef EIGEN_HAS_CXX11_NOEXCEPT #if EIGEN_MAX_CPP_VER>=11 && \ (__has_feature(cxx_noexcept) \ - || (__cplusplus > 201103L) \ - || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ - || EIGEN_COMP_MSVC >= 1900) + || (EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400))) #define EIGEN_HAS_CXX11_NOEXCEPT 1 #else #define EIGEN_HAS_CXX11_NOEXCEPT 0 #endif #endif -/** Allows to disable some optimizations which might affect the accuracy of the result. - * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. - * They currently include: - * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. 
- */ -#ifndef EIGEN_FAST_MATH -#define EIGEN_FAST_MATH 1 +#ifndef EIGEN_HAS_CXX11_ATOMIC + #if EIGEN_MAX_CPP_VER>=11 && \ + (__has_feature(cxx_atomic) \ + || (EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700))) + #define EIGEN_HAS_CXX11_ATOMIC 1 + #else + #define EIGEN_HAS_CXX11_ATOMIC 0 + #endif +#endif + +#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL + #if EIGEN_MAX_CPP_VER>=11 && \ + (EIGEN_COMP_CXXVER >= 11 || EIGEN_COMP_MSVC >= 1700) + #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1 + #else + #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0 + #endif +#endif + +// NOTE: the required Apple's clang version is very conservative +// and it could be that XCode 9 works just fine. +// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support +// and not tested. +#ifndef EIGEN_HAS_CXX17_OVERALIGN +#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && ( \ + (EIGEN_COMP_MSVC >= 1912) \ + || (EIGEN_GNUC_AT_LEAST(7,0)) \ + || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500)) \ + || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \ + ) +#define EIGEN_HAS_CXX17_OVERALIGN 1 +#else +#define EIGEN_HAS_CXX17_OVERALIGN 0 +#endif +#endif + +#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR + // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules + #if defined(__NVCC__) + // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr + #ifdef __CUDACC_RELAXED_CONSTEXPR__ + #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC + #endif + #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr) + // clang++ always considers constexpr functions as implicitly __host__ __device__ + #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC + #endif +#endif + +// Does the compiler support the __int128 and __uint128_t extensions for 128-bit +// integer arithmetic? 
+// +// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported, +// but we avoid using them in certain cases: +// +// * Building using Clang for Windows, where the Clang runtime library has +// 128-bit support only on LP64 architectures, but Windows is LLP64. +#ifndef EIGEN_HAS_BUILTIN_INT128 +#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG) +#define EIGEN_HAS_BUILTIN_INT128 1 +#else +#define EIGEN_HAS_BUILTIN_INT128 0 +#endif #endif +//------------------------------------------------------------------------------------------ +// Preprocessor programming helpers +//------------------------------------------------------------------------------------------ + +// This macro can be used to prevent from macro expansion, e.g.: +// std::max EIGEN_NOT_A_MACRO(a,b) +#define EIGEN_NOT_A_MACRO + #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl; // concatenate two tokens @@ -503,7 +911,7 @@ // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline // but GCC is still doing fine with just inline. 
#ifndef EIGEN_STRONG_INLINE -#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC +#if (EIGEN_COMP_MSVC || EIGEN_COMP_ICC) && !defined(EIGEN_GPUCC) #define EIGEN_STRONG_INLINE __forceinline #else #define EIGEN_STRONG_INLINE inline @@ -518,7 +926,7 @@ // Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval Eigen::MatrixBase::eval() const' // : function body not available // See also bug 1367 -#if EIGEN_GNUC_AT_LEAST(4,2) +#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY) #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline #else #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE @@ -538,12 +946,43 @@ #define EIGEN_PERMISSIVE_EXPR #endif +// GPU stuff + +// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC) +#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC) + // Do not try asserts on device code + #ifndef EIGEN_NO_DEBUG + #define EIGEN_NO_DEBUG + #endif + + #ifdef EIGEN_INTERNAL_DEBUGGING + #undef EIGEN_INTERNAL_DEBUGGING + #endif + + #ifdef EIGEN_EXCEPTIONS + #undef EIGEN_EXCEPTIONS + #endif +#endif + +#if defined(SYCL_DEVICE_ONLY) + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif + #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) +// All functions callable from CUDA/HIP code must be qualified with __device__ +#elif defined(EIGEN_GPUCC) + #define EIGEN_DEVICE_FUNC __host__ __device__ +#else + #define EIGEN_DEVICE_FUNC +#endif + + // this macro allows to get rid of linking errors about multiply defined functions. // - static is not very good because it prevents definitions from different object files to be merged. // So static causes the resulting linked executable to be bloated with multiple copies of the same function. // - inline is not perfect either as it unwantedly hints the compiler toward inlining the function. 
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline #ifdef NDEBUG # ifndef EIGEN_NO_DEBUG @@ -553,7 +992,11 @@ // eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89 #ifdef EIGEN_NO_DEBUG - #define eigen_plain_assert(x) + #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device + #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x) + #else + #define eigen_plain_assert(x) + #endif #else #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO namespace Eigen { @@ -627,7 +1070,7 @@ // Suppresses 'unused variable' warnings. namespace Eigen { namespace internal { - template EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {} } } #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); @@ -641,169 +1084,75 @@ namespace Eigen { #endif -//------------------------------------------------------------------------------------------ -// Static and dynamic alignment control -// -// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES -// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. -// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, -// a default value is automatically computed based on architecture, compiler, and OS. +// Acts as a barrier preventing operations involving `X` from crossing. This +// occurs, for example, in the fast rounding trick where a magic constant is +// added then subtracted, which is otherwise compiled away with -ffast-math. 
// -// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} -// to be used to declare statically aligned buffers. -//------------------------------------------------------------------------------------------ - - -/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. - * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, - * so that vectorization doesn't affect binary compatibility. - * - * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link - * vectorized and non-vectorized code. - */ -#if (defined __CUDACC__) - #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) -#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) -#elif EIGEN_COMP_MSVC - #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) -#elif EIGEN_COMP_SUNCC - // FIXME not sure about this one: - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) -#else - #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler -#endif - -// If the user explicitly disable vectorization, then we also disable alignment -#if defined(EIGEN_DONT_VECTORIZE) - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 -#elif defined(EIGEN_VECTORIZE_AVX512) - // 64 bytes static alignmeent is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 -#elif defined(__AVX__) - // 32 bytes static alignmeent is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 -#else - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 -#endif - - -// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense -#define EIGEN_MIN_ALIGN_BYTES 16 - -// Defined the boundary (in bytes) on which the data needs to be aligned. 
Note -// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be -// aligned at all regardless of the value of this #define. - -#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. -#endif - -// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated -// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 -#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) - #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES - #undef EIGEN_MAX_STATIC_ALIGN_BYTES - #endif - #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 -#endif - -#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES - - // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES - - // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable - // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always - // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in - // certain common platform (compiler+architecture combinations) to avoid these problems. - // Only static alignment is really problematic (relies on nonstandard compiler extensions), - // try to keep heap alignment even when we have to disable static alignment. - #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64) - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 - #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) - // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. 
- // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. - // 4.8 and newer seem definitely unaffected. - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 - #else - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 - #endif - - // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX - #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ - && !EIGEN_GCC3_OR_OLDER \ - && !EIGEN_COMP_SUNCC \ - && !EIGEN_OS_QNX - #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 - #else - #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 - #endif - - #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT - #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +// See bug 1674 +#if !defined(EIGEN_OPTIMIZATION_BARRIER) + #if EIGEN_COMP_GNUC + // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html: + // X: Any operand whatsoever. + // r: A register operand is allowed provided that it is in a general + // register. + // g: Any register, memory or immediate integer operand is allowed, except + // for registers that are not general registers. + // w: (AArch32/AArch64) Floating point register, Advanced SIMD vector + // register or SVE vector register. + // x: (SSE) Any SSE register. + // (AArch64) Like w, but restricted to registers 0 to 15 inclusive. + // v: (PowerPC) An Altivec vector register. + // wa:(PowerPC) A VSX register. + // + // "X" (uppercase) should work for all cases, though this seems to fail for + // some versions of GCC for arm/aarch64 with + // "error: inconsistent operand constraints in an 'asm'" + // Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and + // vectors, otherwise + // "error: non-trivial scalar-to-vector conversion, possible invalid + // constraint for vector type" + // + // GCC for ppc64le generates an internal compiler error with x/X/g. + // GCC for AVX generates an internal compiler error with X. 
+ // + // Tested on icc/gcc/clang for sse, avx, avx2, avx512dq + // gcc for arm, aarch64, + // gcc for ppc64le, + // both vectors and scalars. + // + // Note that this is restricted to plain types - this will not work + // directly for std::complex, Eigen::half, Eigen::bfloat16. For these, + // you will need to apply to the underlying POD type. + #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT + // This seems to be broken on clang. Packet4f is loaded into a single + // register rather than a vector, zeroing out some entries. Integer + // types also generate a compile error. + // General, Altivec, VSX. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); + #elif EIGEN_ARCH_ARM_OR_ARM64 + // General, NEON. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #elif EIGEN_ARCH_i386_OR_x86_64 + // General, SSE. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); + #else + // Not implemented for other architectures. + #define EIGEN_OPTIMIZATION_BARRIER(X) + #endif #else - #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 + // Not implemented for other compilers. + #define EIGEN_OPTIMIZATION_BARRIER(X) #endif - #endif -// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES -#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. -// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES) -// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). -// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. 
- - -// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY -#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) -#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) -#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) -#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) +#if EIGEN_COMP_MSVC + // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362. + // This workaround is ugly, but it does the job. +# define EIGEN_CONST_CONDITIONAL(cond) (void)0, cond #else -#define EIGEN_ALIGN_MAX +# define EIGEN_CONST_CONDITIONAL(cond) cond #endif - -// Dynamic alignment control - -#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 -#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. -#endif - -#ifdef EIGEN_DONT_ALIGN - #ifdef EIGEN_MAX_ALIGN_BYTES - #undef EIGEN_MAX_ALIGN_BYTES - #endif - #define EIGEN_MAX_ALIGN_BYTES 0 -#elif !defined(EIGEN_MAX_ALIGN_BYTES) - #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES -#endif - -#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES -#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES -#else -#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES -#endif - - -#ifndef EIGEN_UNALIGNED_VECTORIZE -#define EIGEN_UNALIGNED_VECTORIZE 1 -#endif - -//---------------------------------------------------------------------- - - #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD #define EIGEN_RESTRICT #endif @@ -811,10 +1160,6 @@ namespace Eigen { #define EIGEN_RESTRICT __restrict #endif -#ifndef EIGEN_STACK_ALLOCATION_LIMIT -// 131072 == 128 KB -#define EIGEN_STACK_ALLOCATION_LIMIT 131072 -#endif #ifndef EIGEN_DEFAULT_IO_FORMAT #ifdef EIGEN_MAKING_DOCS @@ -829,8 +1174,23 @@ namespace Eigen { // just an empty macro ! 
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) - // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) + +// When compiling CUDA/HIP device code with NVCC or HIPCC +// pull in math functions from the global namespace. +// In host mode, and when device code is compiled with clang, +// use the std versions. +#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_USING_STD(FUNC) using ::FUNC; +#else + #define EIGEN_USING_STD(FUNC) using std::FUNC; +#endif + +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC)) + // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary, + // otherwise we get duplicate definition errors + // For later MSVC versions, we require explicit operator= definition, otherwise we get + // use of implicitly deleted operator errors. + // (cf Bugs 920, 1000, 1324, 2291) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) @@ -856,7 +1216,7 @@ namespace Eigen { * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. 
*/ #if EIGEN_HAS_CXX11 -#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default; +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default; #else #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) #endif @@ -881,12 +1241,12 @@ namespace Eigen { */ #if EIGEN_HAS_CXX11 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - EIGEN_DEVICE_FUNC Derived() = default; \ - EIGEN_DEVICE_FUNC ~Derived() = default; + Derived() = default; \ + ~Derived() = default; #else #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - EIGEN_DEVICE_FUNC Derived() {}; \ - /* EIGEN_DEVICE_FUNC ~Derived() {}; */ + Derived() {}; \ + /* ~Derived() {}; */ #endif @@ -908,7 +1268,8 @@ namespace Eigen { typedef typename Eigen::internal::ref_selector::type Nested; \ typedef typename Eigen::internal::traits::StorageKind StorageKind; \ typedef typename Eigen::internal::traits::StorageIndex StorageIndex; \ - enum { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ + enum CompileTimeTraits \ + { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ ColsAtCompileTime = Eigen::internal::traits::ColsAtCompileTime, \ Flags = Eigen::internal::traits::Flags, \ SizeAtCompileTime = Base::SizeAtCompileTime, \ @@ -953,6 +1314,14 @@ namespace Eigen { #define EIGEN_IMPLIES(a,b) (!(a) || (b)) +#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC +#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false)) +#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true)) +#else +#define EIGEN_PREDICT_FALSE(x) (x) +#define EIGEN_PREDICT_TRUE(x) (x) +#endif + // the expression type of a standard coefficient wise binary operation #define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \ CwiseBinaryOp< \ @@ -984,14 +1353,14 @@ namespace Eigen { const typename internal::plain_constant_type::type, const EXPR> // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010") -#if 
EIGEN_COMP_MSVC_STRICT<=1600 +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600) #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if::type #else #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X #endif #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC inline \ + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,OPNAME))\ (METHOD)(const T& scalar) const { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -1000,7 +1369,7 @@ namespace Eigen { } #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC inline friend \ + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,OPNAME)) \ (METHOD)(const T& scalar, const StorageBaseType& matrix) { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -1013,15 +1382,23 @@ namespace Eigen { EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_EXCEPTIONS +#endif + + #ifdef EIGEN_EXCEPTIONS # define EIGEN_THROW_X(X) throw X # define EIGEN_THROW throw # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# ifdef __CUDA_ARCH__ +# if defined(EIGEN_CUDA_ARCH) # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") +# elif defined(EIGEN_HIP_DEVICE_COMPILE) +# define EIGEN_THROW_X(X) asm("s_trap 0") +# define EIGEN_THROW asm("s_trap 0") # else # define EIGEN_THROW_X(X) std::abort() # define EIGEN_THROW std::abort() @@ -1041,13 +1418,47 @@ namespace Eigen { # define 
EIGEN_NOEXCEPT # define EIGEN_NOEXCEPT_IF(x) # define EIGEN_NO_THROW throw() -# if EIGEN_COMP_MSVC +# if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17 // MSVC does not support exception specifications (warning C4290), - // and they are deprecated in c++11 anyway. + // and they are deprecated in c++11 anyway. This is even an error in c++17. # define EIGEN_EXCEPTION_SPEC(X) throw() # else # define EIGEN_EXCEPTION_SPEC(X) throw(X) # endif #endif +#if EIGEN_HAS_VARIADIC_TEMPLATES +// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input. +namespace Eigen { +namespace internal { + +inline bool all(){ return true; } + +template +bool all(T t, Ts ... ts){ return t && all(ts...); } + +} +} +#endif + +#if EIGEN_HAS_CXX11_OVERRIDE_FINAL +// provide override and final specifiers if they are available: +# define EIGEN_OVERRIDE override +# define EIGEN_FINAL final +#else +# define EIGEN_OVERRIDE +# define EIGEN_FINAL +#endif + +// Wrapping #pragma unroll in a macro since it is required for SYCL +#if defined(SYCL_DEVICE_ONLY) + #if defined(_MSC_VER) + #define EIGEN_UNROLL_LOOP __pragma(unroll) + #else + #define EIGEN_UNROLL_LOOP _Pragma("unroll") + #endif +#else + #define EIGEN_UNROLL_LOOP +#endif + #endif // EIGEN_MACROS_H diff --git a/inst/include/Eigen/src/Core/util/Memory.h b/inst/include/Eigen/src/Core/util/Memory.h index 291383c5..875318cd 100644 --- a/inst/include/Eigen/src/Core/util/Memory.h +++ b/inst/include/Eigen/src/Core/util/Memory.h @@ -63,14 +63,28 @@ namespace Eigen { namespace internal { -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc() { #ifdef EIGEN_EXCEPTIONS throw std::bad_alloc(); #else std::size_t huge = static_cast(-1); - ::operator new(huge); + #if defined(EIGEN_HIPCC) + // + // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining), + // and as a consequence the code in the #else block triggers the hipcc warning : + // "no overloaded function 
has restriction specifiers that are compatible with the ambient context" + // + // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects + // the same on "operator new" + // Reverting code back to the old version in this #if block for the hipcc compiler + // + new int[huge]; + #else + void* unused = ::operator new(huge); + EIGEN_UNUSED_VARIABLE(unused); + #endif #endif } @@ -83,19 +97,26 @@ inline void throw_std_bad_alloc() /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned. * Fast, but wastes 16 additional bytes of memory. Does not throw any exception. */ -inline void* handmade_aligned_malloc(std::size_t size) +EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) { - void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES); + eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2"); + + EIGEN_USING_STD(malloc) + void *original = malloc(size+alignment); + if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(alignment-1))) + alignment); *(reinterpret_cast(aligned) - 1) = original; return aligned; } /** \internal Frees memory allocated with handmade_aligned_malloc */ -inline void handmade_aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr) { - if (ptr) std::free(*(reinterpret_cast(ptr) - 1)); + if (ptr) { + EIGEN_USING_STD(free) + free(*(reinterpret_cast(ptr) - 1)); + } } /** \internal @@ -114,7 +135,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, 
previous_aligned, size); - + *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -142,7 +163,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } -#else +#else EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif @@ -156,9 +177,12 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - result = std::malloc(size); + + EIGEN_USING_STD(malloc) + result = malloc(size); + #if EIGEN_DEFAULT_ALIGN_BYTES==16 - eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); + eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator."); #endif #else result = handmade_aligned_malloc(size); @@ -174,7 +198,10 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - std::free(ptr); + + EIGEN_USING_STD(free) + free(ptr); + #else handmade_aligned_free(ptr); #endif @@ -187,7 +214,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) */ inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size) { - EIGEN_UNUSED_VARIABLE(old_size); + EIGEN_UNUSED_VARIABLE(old_size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED @@ -218,7 +245,9 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std: { check_that_malloc_is_allowed(); - void *result = std::malloc(size); + EIGEN_USING_STD(malloc) + void *result = malloc(size); + 
if(!result && size) throw_std_bad_alloc(); return result; @@ -232,7 +261,8 @@ template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { - std::free(ptr); + EIGEN_USING_STD(free) + free(ptr); } template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) @@ -331,7 +361,7 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size) { destruct_elements_of_array(ptr, size); - aligned_free(ptr); + Eigen::internal::aligned_free(ptr); } /** \internal Deletes objects constructed with conditional_aligned_new @@ -471,8 +501,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index } /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size - */ -template + */ +template inline Index first_multiple(Index size, Index base) { return ((size+base-1)/base)*base; @@ -493,7 +523,8 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - std::memcpy(target, start, size); + EIGEN_USING_STD(memcpy) + memcpy(target, start, size); } }; @@ -502,7 +533,7 @@ template struct smart_copy_helper { { std::copy(start, end, target); } }; -// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. +// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. 
template struct smart_memmove_helper; template void smart_memmove(const T* start, const T* end, T* target) @@ -522,19 +553,30 @@ template struct smart_memmove_helper { template struct smart_memmove_helper { static inline void run(const T* start, const T* end, T* target) - { + { if (UIntPtr(target) < UIntPtr(start)) { std::copy(start, end, target); } - else + else { std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T); - std::copy_backward(start, end, target + count); + std::copy_backward(start, end, target + count); } } }; +#if EIGEN_HAS_RVALUE_REFERENCES +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::move(start, end, target); +} +#else +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::copy(start, end, target); +} +#endif /***************************************************************************** *** Implementation of runtime stack allocation (falling back to malloc) *** @@ -542,7 +584,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#ifndef EIGEN_ALLOCA +#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC @@ -550,6 +592,15 @@ template struct smart_memmove_helper { #endif #endif +// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is +// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because +// the compiler still emits bad code because stack allocation checks use "<=". +// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772 +// is fixed. +#if defined(__clang__) && defined(__thumb__) + #undef EIGEN_ALLOCA +#endif + // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data // at destruction time. 
In practice this helper class is mainly useful to avoid memory leak in case of exceptions. template class aligned_stack_memory_handler : noncopyable @@ -561,12 +612,14 @@ template class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed. **/ + EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) Eigen::internal::construct_elements_of_array(m_ptr, size); } + EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() { if(NumTraits::RequireInitialization && m_ptr) @@ -580,6 +633,60 @@ template class aligned_stack_memory_handler : noncopyable bool m_deallocate; }; +#ifdef EIGEN_ALLOCA + +template::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic + > +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = false; + typedef typename Xpr::Scalar Scalar; + typedef typename nested_eval::type ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) + { + EIGEN_UNUSED_VARIABLE(ptr); + eigen_internal_assert(ptr==0); + } +}; + +template +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = true; + typedef typename Xpr::Scalar Scalar; + typedef typename plain_object_eval::type PlainObject; + typedef Map ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) + : object(ptr==0 ? 
reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()), + m_deallocate(ptr==0) + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::construct_elements_of_array(object.data(), object.size()); + object = xpr; + } + + EIGEN_DEVICE_FUNC + ~local_nested_eval_wrapper() + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::destruct_elements_of_array(object.data(), object.size()); + if(m_deallocate) + Eigen::internal::aligned_free(object.data()); + } + +private: + bool m_deallocate; +}; + +#endif // EIGEN_ALLOCA + template class scoped_array : noncopyable { T* m_ptr; @@ -603,13 +710,15 @@ template void swap(scoped_array &a,scoped_array &b) { std::swap(a.ptr(),b.ptr()); } - + } // end namespace internal /** \internal - * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack - * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform - * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap. + * + * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, + * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack + * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. * Here is an example: @@ -620,9 +729,17 @@ template void swap(scoped_array &a,scoped_array &b) * } * \endcode * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. 
+ * + * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to + * \code + * typename internal::nested_eval::type NAME(XPR); + * \endcode + * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. + * This is accomplished through alloca if this later is supported and if the required number of bytes + * is below EIGEN_STACK_ALLOCATION_LIMIT. */ #ifdef EIGEN_ALLOCA - + #if EIGEN_DEFAULT_ALIGN_BYTES>0 // We always manually re-align the result of EIGEN_ALLOCA. // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment. @@ -639,13 +756,23 @@ template void swap(scoped_array &a,scoped_array &b) : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) + + #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \ + Eigen::internal::local_nested_eval_wrapper EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast( \ + ( (Eigen::internal::local_nested_eval_wrapper::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \ + ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \ + typename Eigen::internal::local_nested_eval_wrapper::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object) + #else #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ Eigen::internal::check_size_for_overflow(SIZE); \ TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? 
NAME : 0,SIZE,true) - + + +#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval::type NAME(XPR) + #endif @@ -653,32 +780,56 @@ template void swap(scoped_array &a,scoped_array &b) *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] *** *****************************************************************************/ -#if EIGEN_MAX_ALIGN_BYTES!=0 +#if EIGEN_HAS_CXX17_OVERALIGN + +// C++17 -> no need to bother about alignment anymore :) + +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) + +#else + +// HIP does not support new/delete on device. +#if EIGEN_MAX_ALIGN_BYTES!=0 && !defined(EIGEN_HIP_DEVICE_COMPILE) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc(size); } \ EIGEN_CATCH (...) 
{ return 0; } \ } #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void *operator new(std::size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ + EIGEN_DEVICE_FUNC \ void *operator new[](std::size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ /* in-place new and delete. since (at least afaik) there is no actual */ \ /* memory allocated we can safely let the default implementation handle */ \ /* this particular case. 
*/ \ + EIGEN_DEVICE_FUNC \ static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \ + EIGEN_DEVICE_FUNC \ static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \ /* nothrow-new (returns zero instead of std::bad_alloc) */ \ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \ Eigen::internal::conditional_aligned_free(ptr); \ } \ @@ -688,8 +839,14 @@ template void swap(scoped_array &a,scoped_array &b) #endif #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0))) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool( \ + ((Size)!=Eigen::Dynamic) && \ + (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES )==0)) || \ + ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) || \ + ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0)) ))) + +#endif /****************************************************************************/ @@ -703,13 +860,13 @@ template void swap(scoped_array &a,scoped_array &b) * - 32 bytes alignment if AVX is enabled. * - 64 bytes alignment if AVX512 is enabled. 
* -* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented * \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: * \code * // Matrix4f requires 16 bytes alignment: -* std::map< int, Matrix4f, std::less, +* std::map< int, Matrix4f, std::less, * aligned_allocator > > my_map_mat4; * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator: * std::map< int, Vector3f > my_map_vec3; @@ -744,18 +901,19 @@ class aligned_allocator : public std::allocator ~aligned_allocator() {} + #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) + // In gcc std::allocator::max_size() is bugged making gcc triggers a warning: + // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 + size_type max_size() const { + return (std::numeric_limits::max)()/sizeof(T); + } + #endif + pointer allocate(size_type num, const void* /*hint*/ = 0) { internal::check_size_for_overflow(num); - size_type size = num * sizeof(T); -#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) - // workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 - // It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 - if(size>=std::size_t((std::numeric_limits::max)())) - return 0; - else -#endif - return static_cast( internal::aligned_malloc(size) ); + return static_cast( internal::aligned_malloc(num * sizeof(T)) ); } void deallocate(pointer p, size_type /*num*/) @@ -914,20 +1072,32 @@ inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs) { if(max_std_funcs>=4) queryCacheSizes_intel_direct(l1,l2,l3); - else + else if(max_std_funcs>=2) queryCacheSizes_intel_codes(l1,l2,l3); + else + l1 = l2 = 
l3 = 0; } inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) { int abcd[4]; abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; - EIGEN_CPUID(abcd,0x80000005,0); - l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB - abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; - EIGEN_CPUID(abcd,0x80000006,0); - l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB - l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB + + // First query the max supported function. + EIGEN_CPUID(abcd,0x80000000,0); + if(static_cast(abcd[0]) >= static_cast(0x80000006)) + { + EIGEN_CPUID(abcd,0x80000005,0); + l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB + abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; + EIGEN_CPUID(abcd,0x80000006,0); + l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB + l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB + } + else + { + l1 = l2 = l3 = 0; + } } #endif @@ -943,7 +1113,7 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) // identify the CPU vendor EIGEN_CPUID(abcd,0x0,0); - int max_std_funcs = abcd[1]; + int max_std_funcs = abcd[0]; if(cpuid_is_vendor(abcd,GenuineIntel)) queryCacheSizes_intel(l1,l2,l3,max_std_funcs); else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_)) diff --git a/inst/include/Eigen/src/Core/util/Meta.h b/inst/include/Eigen/src/Core/util/Meta.h index 9b61ff03..81ae2a32 100755 --- a/inst/include/Eigen/src/Core/util/Meta.h +++ b/inst/include/Eigen/src/Core/util/Meta.h @@ -11,13 +11,54 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(__CUDA_ARCH__) -#include -#include +#if defined(EIGEN_GPU_COMPILE_PHASE) + + #include + + #if defined(EIGEN_CUDA_ARCH) + #include + #endif + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" + #endif + #endif -#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L +// Recent versions of ICC require for pointer types below. 
+#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11) + +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT #include +namespace Eigen { +namespace numext { +typedef std::uint8_t uint8_t; +typedef std::int8_t int8_t; +typedef std::uint16_t uint16_t; +typedef std::int16_t int16_t; +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without c++11, all compilers able to compile Eigen also +// provide the C99 stdint.h header file. +#include +namespace Eigen { +namespace numext { +typedef ::uint8_t uint8_t; +typedef ::int8_t int8_t; +typedef ::uint16_t uint16_t; +typedef ::int16_t int16_t; +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} #endif namespace Eigen { @@ -43,26 +84,33 @@ namespace internal { // Only recent versions of ICC complain about using ptrdiff_t to hold pointers, // and older versions do not provide *intptr_t types. 
-#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L +#if EIGEN_ICC_NEEDS_CSTDINT typedef std::intptr_t IntPtr; typedef std::uintptr_t UIntPtr; #else typedef std::ptrdiff_t IntPtr; typedef std::size_t UIntPtr; #endif +#undef EIGEN_ICC_NEEDS_CSTDINT struct true_type { enum { value = 1 }; }; struct false_type { enum { value = 0 }; }; +template +struct bool_constant; + +template<> +struct bool_constant : true_type {}; + +template<> +struct bool_constant : false_type {}; + template struct conditional { typedef Then type; }; template struct conditional { typedef Else type; }; -template struct is_same { enum { value = 0 }; }; -template struct is_same { enum { value = 1 }; }; - template struct remove_reference { typedef T type; }; template struct remove_reference { typedef T type; }; @@ -97,23 +145,31 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +template struct is_same { enum { value = 0 }; }; +template struct is_same { enum { value = 1 }; }; + +template< class T > +struct is_void : is_same::type> {}; + #if EIGEN_HAS_CXX11 +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; using std::is_integral; #else -template struct is_integral { enum { value = false }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; +template struct is_integral 
{ enum { value = false }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; #if EIGEN_COMP_MSVC -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral{ enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; #endif #endif @@ -137,6 +193,16 @@ template<> struct make_unsigned { typedef unsigned long type; template<> struct make_unsigned { typedef unsigned __int64 type; }; template<> struct make_unsigned { typedef unsigned __int64 type; }; #endif + +// Some platforms define int64_t as `long long` even for C++03, where +// `long long` is not guaranteed by the standard. In this case we are missing +// the definition for make_unsigned. If we just define it, we run into issues +// where `long long` doesn't exist in some compilers for C++03. We therefore add +// the specialization for these platforms only. 
+#if EIGEN_OS_MAC || EIGEN_COMP_MINGW +template<> struct make_unsigned { typedef unsigned long long type; }; +template<> struct make_unsigned { typedef unsigned long long type; }; +#endif #endif template struct add_const { typedef const T type; }; @@ -151,6 +217,11 @@ template struct add_const_on_value_type { typedef T const template struct add_const_on_value_type { typedef T const* const type; }; template struct add_const_on_value_type { typedef T const* const type; }; +#if EIGEN_HAS_CXX11 + +using std::is_convertible; + +#else template struct is_convertible_impl @@ -164,16 +235,19 @@ struct is_convertible_impl struct yes {int a[1];}; struct no {int a[2];}; - static yes test(const To&, int); + template + static yes test(T, int); + + template static no test(any_conversion, ...); public: - static From ms_from; + static typename internal::remove_reference::type* ms_from; #ifdef __INTEL_COMPILER #pragma warning push #pragma warning ( disable : 2259 ) #endif - enum { value = sizeof(test(ms_from, 0))==sizeof(yes) }; + enum { value = sizeof(test(*ms_from, 0))==sizeof(yes) }; #ifdef __INTEL_COMPILER #pragma warning pop #endif @@ -182,10 +256,17 @@ struct is_convertible_impl template struct is_convertible { - enum { value = is_convertible_impl::type, - typename remove_all::type>::value }; + enum { value = is_convertible_impl::value }; }; +template +struct is_convertible { enum { value = false }; }; + +template +struct is_convertible { enum { value = true }; }; + +#endif + /** \internal Allows to enable/disable an overload * according to a compile time condition. 
*/ @@ -194,7 +275,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -205,7 +286,7 @@ namespace device { template struct numeric_limits { EIGEN_DEVICE_FUNC - static T epsilon() { return 0; } + static EIGEN_CONSTEXPR T epsilon() { return 0; } static T (max)() { assert(false && "Highest not supported for this type"); } static T (min)() { assert(false && "Lowest not supported for this type"); } static T infinity() { assert(false && "Infinity not supported for this type"); } @@ -213,91 +294,130 @@ template struct numeric_limits }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { return CUDART_MAX_NORMAL_F; } - EIGEN_DEVICE_FUNC + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF_F; } + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } EIGEN_DEVICE_FUNC - static float quiet_NaN() { return CUDART_NAN_F; } + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double epsilon() { return __DBL_EPSILON__; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double (max)() { return DBL_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { return CUDART_INF; } 
+ static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } EIGEN_DEVICE_FUNC - static double quiet_NaN() { return CUDART_NAN; } + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int (max)() { return INT_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int (min)() { return INT_MIN; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int (max)() { return UINT_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int (min)() { return 0; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long (max)() { return LONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long (min)() { return LONG_MIN; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long (max)() { return ULONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long (min)() { return 0; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long (max)() { return LLONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long (min)() { return LLONG_MIN; } }; template<> 
struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long (max)() { return ULLONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long (min)() { return 0; } }; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool epsilon() { return false; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool (max)() { return true; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool (min)() { return false; } +}; } -#endif +#endif // defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 /** \internal - * A base class do disable default copy ctor and copy assignement operator. + * A base class do disable default copy ctor and copy assignment operator. */ class noncopyable { @@ -309,13 +429,82 @@ class noncopyable }; /** \internal - * Convenient struct to get the result type of a unary or binary functor. + * Provides access to the number of elements in the object of as a compile-time constant expression. + * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default). + * + * Similar to std::tuple_size, but more general. + * + * It currently supports: + * - any types T defining T::SizeAtCompileTime + * - plain C arrays as T[N] + * - std::array (c++11) + * - some internal types such as SingleRange and AllRange + * + * The second template parameter eases SFINAE-based specializations. 
+ */ +template struct array_size { + enum { value = Dynamic }; +}; + +template struct array_size::type> { + enum { value = T::SizeAtCompileTime }; +}; + +template struct array_size { + enum { value = N }; +}; +template struct array_size { + enum { value = N }; +}; + +#if EIGEN_HAS_CXX11 +template struct array_size > { + enum { value = N }; +}; +template struct array_size > { + enum { value = N }; +}; +#endif + +/** \internal + * Analogue of the std::size free function. + * It returns the size of the container or view \a x of type \c T + * + * It currently supports: + * - any types T defining a member T::size() const + * - plain C arrays as T[N] * - * It supports both the current STL mechanism (using the result_type member) as well as - * upcoming next STL generation (using a templated result member). - * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack. */ -#if EIGEN_HAS_STD_RESULT_OF +template +EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } + +template +EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } + +/** \internal + * Convenient struct to get the result type of a nullary, unary, binary, or + * ternary functor. + * + * Pre C++11: + * Supports both a Func::result_type member and templated + * Func::result::type member. + * + * If none of these members is provided, then the type of the first + * argument is returned. + * + * Post C++11: + * This uses std::result_of. However, note the `type` member removes + * const and converts references/pointers to their corresponding value type. 
+ */ +#if EIGEN_HAS_STD_INVOKE_RESULT +template struct result_of; + +template +struct result_of { + typedef typename std::invoke_result::type type1; + typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_STD_RESULT_OF template struct result_of { typedef typename std::result_of::type type1; typedef typename remove_all::type type; @@ -327,6 +516,28 @@ struct has_none {int a[1];}; struct has_std_result_type {int a[2];}; struct has_tr1_result {int a[3];}; +template +struct nullary_result_of_select {}; + +template +struct nullary_result_of_select {typedef typename Func::result_type type;}; + +template +struct nullary_result_of_select {typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename nullary_result_of_select::type type; +}; + template struct unary_result_of_select {typedef typename internal::remove_all::type type;}; @@ -396,6 +607,45 @@ struct result_of { enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; typedef typename ternary_result_of_select::type type; }; + +#endif + +#if EIGEN_HAS_STD_INVOKE_RESULT +template +struct invoke_result { + typedef typename std::invoke_result::type type1; + typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_CXX11 +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; +#else +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template 
+struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; #endif struct meta_yes { char a[1]; }; @@ -405,10 +655,10 @@ struct meta_no { char a[2]; }; template struct has_ReturnType { - template static meta_yes testFunctor(typename C::ReturnType const *); - template static meta_no testFunctor(...); + template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); + template static meta_no testFunctor(...); - enum { value = sizeof(testFunctor(0)) == sizeof(meta_yes) }; + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; }; template const T* return_ptr(); @@ -465,20 +715,25 @@ class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? /** \internal Computes the least common multiple of two positive integer A and B - * at compile-time. It implements a naive algorithm testing all multiples of A. - * It thus works better if A>=B. + * at compile-time. */ -template +template=B)> struct meta_least_common_multiple { enum { ret = meta_least_common_multiple::ret }; }; +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; template -struct meta_least_common_multiple +struct meta_least_common_multiple { enum { ret = A*K }; }; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template struct scalar_product_traits { @@ -491,17 +746,27 @@ template struct scalar_product_traits // typedef typename scalar_product_traits::type, typename remove_all::type>::ReturnType type; // }; +/** \internal Obtains a POD type suitable to use as storage for an object of a size + * of at most Len bytes, aligned as specified by \c Align. 
+ */ +template +struct aligned_storage { + struct type { + EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len]; + }; +}; + } // end namespace internal namespace numext { - -#if defined(__CUDA_ARCH__) + +#if defined(EIGEN_GPU_COMPILE_PHASE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 using internal::device::numeric_limits; #else using std::numeric_limits; @@ -510,6 +775,7 @@ using std::numeric_limits; // Integer division with rounding up. // T is assumed to be an integer type with a>=0, and b>0 template +EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; @@ -517,52 +783,30 @@ T div_ceil(const T &a, const T &b) // The aim of the following functions is to bypass -Wfloat-equal warnings // when we really want a strict equality comparison on floating points. -template EIGEN_STRONG_INLINE +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -template<> EIGEN_STRONG_INLINE +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } -template<> EIGEN_STRONG_INLINE +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } +#endif -template EIGEN_STRONG_INLINE +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -template<> EIGEN_STRONG_INLINE +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return 
std::not_equal_to()(x,y); } -template<> EIGEN_STRONG_INLINE +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } +#endif } // end namespace numext } // end namespace Eigen -// Define portable (u)int{32,64} types -#if EIGEN_HAS_CXX11 -#include -namespace Eigen { -namespace numext { -typedef std::uint32_t uint32_t; -typedef std::int32_t int32_t; -typedef std::uint64_t uint64_t; -typedef std::int64_t int64_t; -} -} -#else -// Without c++11, all compilers able to compile Eigen also -// provides the C99 stdint.h header file. -#include -namespace Eigen { -namespace numext { -typedef ::uint32_t uint32_t; -typedef ::int32_t int32_t; -typedef ::uint64_t uint64_t; -typedef ::int64_t int64_t; -} -} -#endif - - #endif // EIGEN_META_H diff --git a/inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h b/inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h index 1ce6fd1b..bfd72d05 100644 --- a/inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -11,9 +11,9 @@ #elif defined __INTEL_COMPILER #pragma warning pop #elif defined __clang__ - #pragma clang diagnostic pop + // #pragma clang diagnostic pop #elif defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) - #pragma GCC diagnostic pop + // #pragma GCC diagnostic pop #endif #if defined __NVCC__ diff --git a/inst/include/Eigen/src/Core/util/ReshapedHelper.h b/inst/include/Eigen/src/Core/util/ReshapedHelper.h new file mode 100644 index 00000000..41243213 --- /dev/null +++ b/inst/include/Eigen/src/Core/util/ReshapedHelper.h @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_RESHAPED_HELPER_H +#define EIGEN_RESHAPED_HELPER_H + +namespace Eigen { + +enum AutoSize_t { AutoSize }; +const int AutoOrder = 2; + +namespace internal { + +template +struct get_compiletime_reshape_size { + enum { value = get_fixed_value::value }; +}; + +template +Index get_runtime_reshape_size(SizeType size, Index /*other*/, Index /*total*/) { + return internal::get_runtime_value(size); +} + +template +struct get_compiletime_reshape_size { + enum { + other_size = get_fixed_value::value, + value = (TotalSize==Dynamic || other_size==Dynamic) ? Dynamic : TotalSize / other_size }; +}; + +inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { + return total/other; +} + +template +struct get_compiletime_reshape_order { + enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order }; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_RESHAPED_HELPER_H diff --git a/inst/include/Eigen/src/Core/util/StaticAssert.h b/inst/include/Eigen/src/Core/util/StaticAssert.h index 500e4779..c45de590 100644 --- a/inst/include/Eigen/src/Core/util/StaticAssert.h +++ b/inst/include/Eigen/src/Core/util/StaticAssert.h @@ -27,7 +27,7 @@ #ifndef EIGEN_STATIC_ASSERT #ifndef EIGEN_NO_STATIC_ASSERT - #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)) + #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600)) // if native static_assert is enabled, let's use it #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); @@ -103,7 +103,10 @@ STORAGE_KIND_MUST_MATCH=1, STORAGE_INDEX_MUST_MATCH=1, CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, - SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1 + SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, + INVALID_TEMPLATE_PARAMETER=1, 
+ GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1, + THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1 }; }; @@ -182,7 +185,7 @@ ) #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \ - EIGEN_STATIC_ASSERT(!NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) + EIGEN_STATIC_ASSERT(!Eigen::NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes @@ -192,8 +195,8 @@ YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES) #define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \ - EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \ - (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \ + EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) && \ + (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \ THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS) #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \ diff --git a/inst/include/Eigen/src/Core/util/SymbolicIndex.h b/inst/include/Eigen/src/Core/util/SymbolicIndex.h new file mode 100644 index 00000000..354dd9ad --- /dev/null +++ b/inst/include/Eigen/src/Core/util/SymbolicIndex.h @@ -0,0 +1,293 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SYMBOLIC_INDEX_H +#define EIGEN_SYMBOLIC_INDEX_H + +namespace Eigen { + +/** \namespace Eigen::symbolic + * \ingroup Core_Module + * + * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index. 
+ * Here is a simple example: + * + * \code + * // First step, defines symbols: + * struct x_tag {}; static const symbolic::SymbolExpr x; + * struct y_tag {}; static const symbolic::SymbolExpr y; + * struct z_tag {}; static const symbolic::SymbolExpr z; + * + * // Defines an expression: + * auto expr = (x+3)/y+z; + * + * // And evaluate it: (c++14) + * std::cout << expr.eval(x=6,y=3,z=-13) << "\n"; + * + * // In c++98/11, only one symbol per expression is supported for now: + * auto expr98 = (3-x)/2; + * std::cout << expr98.eval(x=6) << "\n"; + * \endcode + * + * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN. + * + */ +namespace symbolic { + +template class Symbol; +template class NegateExpr; +template class AddExpr; +template class ProductExpr; +template class QuotientExpr; + +// A simple wrapper around an integral value to provide the eval method. +// We could also use a free-function symbolic_eval... +template +class ValueExpr { +public: + ValueExpr(IndexType val) : m_value(val) {} + template + IndexType eval_impl(const T&) const { return m_value; } +protected: + IndexType m_value; +}; + +// Specialization for compile-time value, +// It is similar to ValueExpr(N) but this version helps the compiler to generate better code. +template +class ValueExpr > { +public: + ValueExpr() {} + template + EIGEN_CONSTEXPR Index eval_impl(const T&) const { return N; } +}; + + +/** \class BaseExpr + * \ingroup Core_Module + * Common base class of any symbolic expressions + */ +template +class BaseExpr +{ +public: + const Derived& derived() const { return *static_cast(this); } + + /** Evaluate the expression given the \a values of the symbols. + * + * \param values defines the values of the symbols, it can either be a SymbolValue or a std::tuple of SymbolValue + * as constructed by SymbolExpr::operator= operator. 
+ * + */ + template + Index eval(const T& values) const { return derived().eval_impl(values); } + +#if EIGEN_HAS_CXX14 + template + Index eval(Types&&... values) const { return derived().eval_impl(std::make_tuple(values...)); } +#endif + + NegateExpr operator-() const { return NegateExpr(derived()); } + + AddExpr > operator+(Index b) const + { return AddExpr >(derived(), b); } + AddExpr > operator-(Index a) const + { return AddExpr >(derived(), -a); } + ProductExpr > operator*(Index a) const + { return ProductExpr >(derived(),a); } + QuotientExpr > operator/(Index a) const + { return QuotientExpr >(derived(),a); } + + friend AddExpr > operator+(Index a, const BaseExpr& b) + { return AddExpr >(b.derived(), a); } + friend AddExpr,ValueExpr<> > operator-(Index a, const BaseExpr& b) + { return AddExpr,ValueExpr<> >(-b.derived(), a); } + friend ProductExpr,Derived> operator*(Index a, const BaseExpr& b) + { return ProductExpr,Derived>(a,b.derived()); } + friend QuotientExpr,Derived> operator/(Index a, const BaseExpr& b) + { return QuotientExpr,Derived>(a,b.derived()); } + + template + AddExpr > > operator+(internal::FixedInt) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + AddExpr > > operator-(internal::FixedInt) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + ProductExpr > > operator*(internal::FixedInt) const + { return ProductExpr > >(derived(),ValueExpr >()); } + template + QuotientExpr > > operator/(internal::FixedInt) const + { return QuotientExpr > >(derived(),ValueExpr >()); } + + template + friend AddExpr > > operator+(internal::FixedInt, const BaseExpr& b) + { return AddExpr > >(b.derived(), ValueExpr >()); } + template + friend AddExpr,ValueExpr > > operator-(internal::FixedInt, const BaseExpr& b) + { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } + template + friend ProductExpr >,Derived> operator*(internal::FixedInt, const BaseExpr& b) + { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } 
+ template + friend QuotientExpr >,Derived> operator/(internal::FixedInt, const BaseExpr& b) + { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } + +#if (!EIGEN_HAS_CXX14) + template + AddExpr > > operator+(internal::FixedInt (*)()) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + AddExpr > > operator-(internal::FixedInt (*)()) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + ProductExpr > > operator*(internal::FixedInt (*)()) const + { return ProductExpr > >(derived(),ValueExpr >()); } + template + QuotientExpr > > operator/(internal::FixedInt (*)()) const + { return QuotientExpr > >(derived(),ValueExpr >()); } + + template + friend AddExpr > > operator+(internal::FixedInt (*)(), const BaseExpr& b) + { return AddExpr > >(b.derived(), ValueExpr >()); } + template + friend AddExpr,ValueExpr > > operator-(internal::FixedInt (*)(), const BaseExpr& b) + { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } + template + friend ProductExpr >,Derived> operator*(internal::FixedInt (*)(), const BaseExpr& b) + { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } + template + friend QuotientExpr >,Derived> operator/(internal::FixedInt (*)(), const BaseExpr& b) + { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } +#endif + + + template + AddExpr operator+(const BaseExpr &b) const + { return AddExpr(derived(), b.derived()); } + + template + AddExpr > operator-(const BaseExpr &b) const + { return AddExpr >(derived(), -b.derived()); } + + template + ProductExpr operator*(const BaseExpr &b) const + { return ProductExpr(derived(), b.derived()); } + + template + QuotientExpr operator/(const BaseExpr &b) const + { return QuotientExpr(derived(), b.derived()); } +}; + +template +struct is_symbolic { + // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr. 
+ enum { value = internal::is_convertible >::value }; +}; + +/** Represents the actual value of a symbol identified by its tag + * + * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used. + */ +template +class SymbolValue +{ +public: + /** Default constructor from the value \a val */ + SymbolValue(Index val) : m_value(val) {} + + /** \returns the stored value of the symbol */ + Index value() const { return m_value; } +protected: + Index m_value; +}; + +/** Expression of a symbol uniquely identified by the template parameter type \c tag */ +template +class SymbolExpr : public BaseExpr > +{ +public: + /** Alias to the template parameter \c tag */ + typedef tag Tag; + + SymbolExpr() {} + + /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag. + * + * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified runtime-time value. + */ + SymbolValue operator=(Index val) const { + return SymbolValue(val); + } + + Index eval_impl(const SymbolValue &values) const { return values.value(); } + +#if EIGEN_HAS_CXX14 + // C++14 versions suitable for multiple symbols + template + Index eval_impl(const std::tuple& values) const { return std::get >(values).value(); } +#endif +}; + +template +class NegateExpr : public BaseExpr > +{ +public: + NegateExpr(const Arg0& arg0) : m_arg0(arg0) {} + + template + Index eval_impl(const T& values) const { return -m_arg0.eval_impl(values); } +protected: + Arg0 m_arg0; +}; + +template +class AddExpr : public BaseExpr > +{ +public: + AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + + template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) + m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +template +class ProductExpr : public BaseExpr > +{ +public: + ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + 
+ template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) * m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +template +class QuotientExpr : public BaseExpr > +{ +public: + QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + + template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) / m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +} // end namespace symbolic + +} // end namespace Eigen + +#endif // EIGEN_SYMBOLIC_INDEX_H diff --git a/inst/include/Eigen/src/Core/util/XprHelper.h b/inst/include/Eigen/src/Core/util/XprHelper.h index 6bb49708..71c32b8a 100644 --- a/inst/include/Eigen/src/Core/util/XprHelper.h +++ b/inst/include/Eigen/src/Core/util/XprHelper.h @@ -49,6 +49,12 @@ template struct is_valid_index_type }; }; +// true if both types are not valid index types +template +struct valid_indexed_view_overload { + enum { value = !(internal::is_valid_index_type::value && internal::is_valid_index_type::value) }; +}; + // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar // Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression. 
@@ -123,19 +129,23 @@ struct promote_index_type template class variable_if_dynamic { public: - EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + operator T() const { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } }; template class variable_if_dynamic { T m_value; - EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); } public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; } }; @@ -146,8 +156,10 @@ template class variable_if_dynamicindex public: EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void setValue(T) {} }; template class variable_if_dynamicindex @@ -172,16 
+184,7 @@ template struct functor_traits template struct packet_traits; -template struct unpacket_traits -{ - typedef T type; - typedef T half; - enum - { - size = 1, - alignment = 1 - }; -}; +template struct unpacket_traits; template::size)==0 || is_same::half>::value> @@ -400,7 +403,7 @@ template struct plain_matrix_type_row_major typedef Matrix::Scalar, Rows, Cols, - (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor, + (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor, MaxRows, MaxCols > type; @@ -417,7 +420,7 @@ struct ref_selector T const&, const T >::type type; - + typedef typename conditional< bool(traits::Flags & NestByRefBit), T &, @@ -455,7 +458,7 @@ template { enum { ScalarReadCost = NumTraits::Scalar>::ReadCost, - CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory? + CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary? // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1. // This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON // for all evaluator creating a temporary. This flag is then propagated by the parent evaluators. 
@@ -596,14 +599,14 @@ template MatrixRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType; typedef Array ArrayRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType; typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixRowType, - ArrayRowType + ArrayRowType >::type type; }; @@ -618,7 +621,7 @@ struct plain_col_type typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixColType, - ArrayColType + ArrayColType >::type type; }; @@ -634,7 +637,7 @@ struct plain_diag_type typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixDiagType, - ArrayDiagType + ArrayDiagType >::type type; }; @@ -671,24 +674,39 @@ template struct is_diagonal > template struct is_diagonal > { enum { ret = true }; }; + +template struct is_identity +{ enum { value = false }; }; + +template struct is_identity, T> > +{ enum { value = true }; }; + + template struct glue_shapes; template<> struct glue_shapes { typedef TriangularShape type; }; template -bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::ret&&has_direct_access::ret, T1>::type * = 0) +struct possibly_same_dense { + enum { value = has_direct_access::ret && has_direct_access::ret && is_same::value }; +}; + +template +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template -bool is_same_dense(const T1 &, const T2 &, typename enable_if::ret&&has_direct_access::ret), T1>::type * = 0) +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) { return false; } // Internal helper defining the cost of a scalar division for the type T. 
// The default heuristic can be specialized for each scalar type and architecture. -template +template struct scalar_div_cost { enum { value = 8*NumTraits::MulCost }; }; @@ -735,7 +753,7 @@ std::string demangle_flags(int f) if(f&DirectAccessBit) res += " | Direct"; if(f&NestByRefBit) res += " | NestByRef"; if(f&NoPreferredStorageOrderBit) res += " | NoPreferredStorageOrderBit"; - + return res; } #endif @@ -832,7 +850,7 @@ struct ScalarBinaryOpTraits #define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \ EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType >::value), \ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - + } // end namespace Eigen #endif // EIGEN_XPRHELPER_H diff --git a/inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h index dc5fae06..081e918f 100644 --- a/inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -214,7 +214,7 @@ template class ComplexEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/inst/include/Eigen/src/Eigenvalues/ComplexSchur.h b/inst/include/Eigen/src/Eigenvalues/ComplexSchur.h index 4354e401..fc71468f 100644 --- a/inst/include/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/inst/include/Eigen/src/Eigenvalues/ComplexSchur.h @@ -212,7 +212,7 @@ template class ComplexSchur /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. 
*/ ComputationInfo info() const { diff --git a/inst/include/Eigen/src/Eigenvalues/EigenSolver.h b/inst/include/Eigen/src/Eigenvalues/EigenSolver.h index f205b185..572b29e4 100644 --- a/inst/include/Eigen/src/Eigenvalues/EigenSolver.h +++ b/inst/include/Eigen/src/Eigenvalues/EigenSolver.h @@ -110,7 +110,7 @@ template class EigenSolver * * \sa compute() for an example. */ - EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {} + EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {} /** \brief Default constructor with memory preallocation * @@ -277,7 +277,7 @@ template class EigenSolver template EigenSolver& compute(const EigenBase& matrix, bool computeEigenvectors = true); - /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. 
*/ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); diff --git a/inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h index 5f6bb828..d0f9091b 100644 --- a/inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +++ b/inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h @@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT * * \returns Reference to \c *this * - * Accoring to \p options, this function computes eigenvalues and (if requested) + * According to \p options, this function computes eigenvalues and (if requested) * the eigenvectors of one of the following three generalized eigenproblems: * - \c Ax_lBx: \f$ Ax = \lambda B x \f$ * - \c ABx_lx: \f$ ABx = \lambda x \f$ diff --git a/inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h index f647f69b..1f211393 100644 --- a/inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -267,7 +267,7 @@ template class HessenbergDecomposition private: - typedef Matrix VectorType; + typedef Matrix VectorType; typedef typename NumTraits::Real RealScalar; static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp); @@ -315,7 +315,7 @@ void HessenbergDecomposition::_compute(MatrixType& matA, CoeffVector // A = A H' matA.rightCols(remainingSize) - .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0)); + .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0)); } } diff --git a/inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index e4e42607..66e5a3db 100644 --- 
a/inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -84,7 +84,7 @@ MatrixBase::eigenvalues() const * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues() */ template -inline typename SelfAdjointView::EigenvaluesReturnType +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { PlainObject thisAsMatrix(*this); @@ -147,7 +147,7 @@ MatrixBase::operatorNorm() const * \sa eigenvalues(), MatrixBase::operatorNorm() */ template -inline typename SelfAdjointView::RealScalar +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::RealScalar SelfAdjointView::operatorNorm() const { return eigenvalues().cwiseAbs().maxCoeff(); diff --git a/inst/include/Eigen/src/Eigenvalues/RealQZ.h b/inst/include/Eigen/src/Eigenvalues/RealQZ.h index b3a910dd..50913018 100644 --- a/inst/include/Eigen/src/Eigenvalues/RealQZ.h +++ b/inst/include/Eigen/src/Eigenvalues/RealQZ.h @@ -90,8 +90,9 @@ namespace Eigen { m_Z(size, size), m_workspace(size*2), m_maxIters(400), - m_isInitialized(false) - { } + m_isInitialized(false), + m_computeQZ(true) + {} /** \brief Constructor; computes real QZ decomposition of given matrices * @@ -108,9 +109,11 @@ namespace Eigen { m_Z(A.rows(),A.cols()), m_workspace(A.rows()*2), m_maxIters(400), - m_isInitialized(false) { - compute(A, B, computeQZ); - } + m_isInitialized(false), + m_computeQZ(true) + { + compute(A, B, computeQZ); + } /** \brief Returns matrix Q in the QZ decomposition. * @@ -161,7 +164,7 @@ namespace Eigen { /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. 
*/ ComputationInfo info() const { diff --git a/inst/include/Eigen/src/Eigenvalues/RealSchur.h b/inst/include/Eigen/src/Eigenvalues/RealSchur.h index 9191519a..7304ef34 100644 --- a/inst/include/Eigen/src/Eigenvalues/RealSchur.h +++ b/inst/include/Eigen/src/Eigenvalues/RealSchur.h @@ -190,7 +190,7 @@ template class RealSchur RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU); /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { @@ -270,8 +270,13 @@ RealSchur& RealSchur::compute(const EigenBase // Step 1. Reduce to Hessenberg form m_hess.compute(matrix.derived()/scale); - // Step 2. Reduce to real Schur form - computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU); + // Step 2. Reduce to real Schur form + // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg + // to be able to pass our working-space buffer for the Householder to Dense evaluation. + m_workspaceVector.resize(matrix.cols()); + if(computeU) + m_hess.matrixQ().evalTo(m_matU, m_workspaceVector); + computeFromHessenberg(m_hess.matrixH(), m_matU, computeU); m_matT *= scale; @@ -284,13 +289,13 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa using std::abs; m_matT = matrixH; - if(computeU) + m_workspaceVector.resize(m_matT.cols()); + if(computeU && !internal::is_same_dense(m_matU,matrixQ)) m_matU = matrixQ; Index maxIters = m_maxIters; if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrixH.rows(); - m_workspaceVector.resize(m_matT.cols()); Scalar* workspace = &m_workspaceVector.coeffRef(0); // The matrix m_matT is divided in three parts. 
diff --git a/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index d37656fa..14692365 100644 --- a/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template struct direct_selfadjoint_eigenvalues; + template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec); } @@ -42,10 +44,14 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag * \f$ v \f$ such that \f$ Av = \lambda v \f$. The eigenvalues of a * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the - * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$ (for selfadjoint - * matrices, the matrix \f$ V \f$ is always invertible). This is called the + * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$. This is called the * eigendecomposition. * + * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal + * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then + * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is + * equal to its transpose, \f$ V^{-1} = V^T \f$. + * * The algorithm exploits the fact that the matrix is selfadjoint, making it * faster and more accurate than the general purpose eigenvalue algorithms * implemented in EigenSolver and ComplexEigenSolver. @@ -119,7 +125,10 @@ template class SelfAdjointEigenSolver : m_eivec(), m_eivalues(), m_subdiag(), - m_isInitialized(false) + m_hcoeffs(), + m_info(InvalidInput), + m_isInitialized(false), + m_eigenvectorsOk(false) { } /** \brief Constructor, pre-allocates memory for dynamic-size matrices. 
@@ -139,7 +148,9 @@ template class SelfAdjointEigenSolver : m_eivec(size, size), m_eivalues(size), m_subdiag(size > 1 ? size - 1 : 1), - m_isInitialized(false) + m_hcoeffs(size > 1 ? size - 1 : 1), + m_isInitialized(false), + m_eigenvectorsOk(false) {} /** \brief Constructor; computes eigendecomposition of given matrix. @@ -163,7 +174,9 @@ template class SelfAdjointEigenSolver : m_eivec(matrix.rows(), matrix.cols()), m_eivalues(matrix.cols()), m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1), - m_isInitialized(false) + m_hcoeffs(matrix.cols() > 1 ? matrix.cols() - 1 : 1), + m_isInitialized(false), + m_eigenvectorsOk(false) { compute(matrix.derived(), options); } @@ -250,6 +263,11 @@ template class SelfAdjointEigenSolver * matrix \f$ A \f$, then the matrix returned by this function is the * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$. * + * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal + * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then + * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is + * equal to its transpose, \f$ V^{-1} = V^T \f$. + * * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out * @@ -337,7 +355,7 @@ template class SelfAdjointEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. 
*/ EIGEN_DEVICE_FUNC ComputationInfo info() const @@ -354,7 +372,8 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static void check_template_parameters() + static EIGEN_DEVICE_FUNC + void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -362,6 +381,7 @@ template class SelfAdjointEigenSolver EigenvectorsType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; + typename TridiagonalizationType::CoeffVectorType m_hcoeffs; ComputationInfo m_info; bool m_isInitialized; bool m_eigenvectorsOk; @@ -403,7 +423,7 @@ ::compute(const EigenBase& a_matrix, int options) const InputType &matrix(a_matrix.derived()); - using std::abs; + EIGEN_USING_STD(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -434,7 +454,8 @@ ::compute(const EigenBase& a_matrix, int options) if(scale==RealScalar(0)) scale = RealScalar(1); mat.template triangularView() /= scale; m_subdiag.resize(n-1); - internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors); + m_hcoeffs.resize(n-1); + internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, computeEigenvectors); m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec); @@ -479,10 +500,9 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - using std::abs; - ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -493,15 +513,23 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag typedef typename DiagType::RealScalar RealScalar; const RealScalar considerAsZero = (std::numeric_limits::min)(); - const RealScalar precision = 
RealScalar(2)*NumTraits::epsilon(); - + const RealScalar precision_inv = RealScalar(1)/NumTraits::epsilon(); while (end>0) { - for (Index i = start; i0 && subdiag[end-1]==RealScalar(0)) { end--; @@ -535,7 +563,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - std::swap(diag[i], diag[k+i]); + numext::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -566,10 +594,10 @@ template struct direct_selfadjoint_eigenvalues struct direct_selfadjoint_eigenvalues res, Ref representative) { - EIGEN_USING_STD_MATH(sqrt) - EIGEN_USING_STD_MATH(abs) + EIGEN_USING_STD(abs); + EIGEN_USING_STD(sqrt); Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -720,7 +748,7 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void computeRoots(const MatrixType& m, VectorType& roots) { - using std::sqrt; + EIGEN_USING_STD(sqrt); const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0))); const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1)); roots(0) = t1 - t0; @@ -730,8 +758,8 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void run(SolverType& solver, const MatrixType& mat, int options) { - EIGEN_USING_STD_MATH(sqrt); - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(sqrt); + EIGEN_USING_STD(abs); eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 @@ -804,32 +832,38 @@ ::computeDirect(const MatrixType& matrix, int options) } namespace internal { + +// Francis implicit QR step. template EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - using std::abs; + // Wilkinson Shift. 
RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still // underflow thus leading to inf/NaN values when using the following commented code: -// RealScalar e2 = numext::abs2(subdiag[end-1]); -// RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2)); + // RealScalar e2 = numext::abs2(subdiag[end-1]); + // RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2)); // This explain the following, somewhat more complicated, version: RealScalar mu = diag[end]; - if(td==RealScalar(0)) - mu -= abs(e); - else - { - RealScalar e2 = numext::abs2(subdiag[end-1]); - RealScalar h = numext::hypot(td,e); - if(e2==RealScalar(0)) mu -= (e / (td + (td>RealScalar(0) ? RealScalar(1) : RealScalar(-1)))) * (e / h); - else mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); + if(td==RealScalar(0)) { + mu -= numext::abs(e); + } else if (e != RealScalar(0)) { + const RealScalar e2 = numext::abs2(e); + const RealScalar h = numext::hypot(td,e); + if(e2 == RealScalar(0)) { + mu -= e / ((td + (td>RealScalar(0) ? h : -h)) / e); + } else { + mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); + } } - + RealScalar x = diag[start] - mu; RealScalar z = subdiag[start]; - for (Index k = start; k < end; ++k) + // If z ever becomes zero, the Givens rotation will be the identity and + // z will stay zero for all future iterations. + for (Index k = start; k < end && z != RealScalar(0); ++k) { JacobiRotation rot; rot.makeGivens(x, z); @@ -842,12 +876,11 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta diag[k+1] = rot.s() * sdk + rot.c() * dkp1; subdiag[k] = rot.c() * sdk - rot.s() * dkp1; - if (k > start) subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z; + // "Chasing the bulge" to return to triangular form. 
x = subdiag[k]; - if (k < end - 1) { z = -rot.s() * subdiag[k+1]; diff --git a/inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h b/inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h index 1d102c17..674c92a3 100644 --- a/inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -11,10 +11,10 @@ #ifndef EIGEN_TRIDIAGONALIZATION_H #define EIGEN_TRIDIAGONALIZATION_H -namespace Eigen { +namespace Eigen { namespace internal { - + template struct TridiagonalizationMatrixTReturnType; template struct traits > @@ -25,6 +25,7 @@ struct traits > }; template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -344,6 +345,7 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -352,7 +354,7 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) Index n = matA.rows(); eigen_assert(n==matA.cols()); eigen_assert(n==hCoeffs.size()+1 || n==1); - + for (Index i = 0; i -void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) +template +EIGEN_DEVICE_FUNC +void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, + CoeffVectorType& hcoeffs, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); - tridiagonalization_inplace_selector::run(mat, diag, subdiag, extractQ); + tridiagonalization_inplace_selector::run(mat, diag, subdiag, hcoeffs, extractQ); } /** \internal @@ -439,10 +443,10 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template - static void run(MatrixType& mat, DiagonalType& diag, 
SubDiagonalType& subdiag, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { - CoeffVectorType hCoeffs(mat.cols()-1); - tridiagonalization_inplace(mat,hCoeffs); + tridiagonalization_inplace(mat, hCoeffs); diag = mat.diagonal().real(); subdiag = mat.template diagonal<-1>().real(); if(extractQ) @@ -462,8 +466,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + template + static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, bool extractQ) { using std::sqrt; const RealScalar tol = (std::numeric_limits::min)(); @@ -507,8 +511,9 @@ struct tridiagonalization_inplace_selector { typedef typename MatrixType::Scalar Scalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + template + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) @@ -542,8 +547,8 @@ template struct TridiagonalizationMatrixTReturnType result.template diagonal<-1>() = m_matrix.template diagonal<-1>(); } - Index rows() const { return m_matrix.rows(); } - Index cols() const { return m_matrix.cols(); } + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } protected: typename MatrixType::Nested m_matrix; diff --git a/inst/include/Eigen/src/Geometry/AlignedBox.h b/inst/include/Eigen/src/Geometry/AlignedBox.h index 066eae4f..55a9d0ae 100644 --- a/inst/include/Eigen/src/Geometry/AlignedBox.h +++ b/inst/include/Eigen/src/Geometry/AlignedBox.h @@ -7,10 +7,46 @@ // Public License v. 2.0. 
If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// Function void Eigen::AlignedBox::transform(const Transform& transform) +// is provided under the following license agreement: +// +// Software License Agreement (BSD License) +// +// Copyright (c) 2011-2014, Willow Garage, Inc. +// Copyright (c) 2014-2015, Open Source Robotics Foundation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Open Source Robotics Foundation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+ #ifndef EIGEN_ALIGNEDBOX_H #define EIGEN_ALIGNEDBOX_H -namespace Eigen { +namespace Eigen { /** \geometry_module \ingroup Geometry_Module * @@ -63,7 +99,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) /** Default constructor initializing a null box. */ EIGEN_DEVICE_FUNC inline AlignedBox() - { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); } + { if (EIGEN_CONST_CONDITIONAL(AmbientDimAtCompileTime!=Dynamic)) setEmpty(); } /** Constructs a null box with \a _dim the dimension of the ambient space. */ EIGEN_DEVICE_FUNC inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim) @@ -231,7 +267,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) {return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); } /** Returns an AlignedBox that is the union of \a b and \c *this. - * \note Merging with an empty box may result in a box bigger than \c *this. + * \note Merging with an empty box may result in a box bigger than \c *this. * \sa extend(const AlignedBox&) */ EIGEN_DEVICE_FUNC inline AlignedBox merged(const AlignedBox& b) const { return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); } @@ -246,6 +282,15 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) return *this; } + /** \returns a copy of \c *this translated by the vector \a t. */ + template + EIGEN_DEVICE_FUNC inline AlignedBox translated(const MatrixBase& a_t) const + { + AlignedBox result(m_min, m_max); + result.translate(a_t); + return result; + } + /** \returns the squared distance between the point \a p and the box \c *this, * and zero if \a p is inside the box. 
* \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&) @@ -265,14 +310,63 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) */ template EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const MatrixBase& p) const - { EIGEN_USING_STD_MATH(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p))); } + { EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p))); } /** \returns the distance between the boxes \a b and \c *this, * and zero if the boxes intersect. * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&) */ EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const AlignedBox& b) const - { EIGEN_USING_STD_MATH(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b))); } + { EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b))); } + + /** + * Specialization of transform for pure translation. + */ + template + EIGEN_DEVICE_FUNC inline void transform( + const typename Transform::TranslationType& translation) + { + this->translate(translation); + } + + /** + * Transforms this box by \a transform and recomputes it to + * still be an axis-aligned box. + * + * \note This method is provided under BSD license (see the top of this file). + */ + template + EIGEN_DEVICE_FUNC inline void transform(const Transform& transform) + { + // Only Affine and Isometry transforms are currently supported. + EIGEN_STATIC_ASSERT(Mode == Affine || Mode == AffineCompact || Mode == Isometry, THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS); + + // Method adapted from FCL src/shape/geometric_shapes_utility.cpp#computeBV(...) 
+ // https://github.com/flexible-collision-library/fcl/blob/fcl-0.4/src/shape/geometric_shapes_utility.cpp#L292 + // + // Here's a nice explanation why it works: https://zeuxcg.org/2010/10/17/aabb-from-obb-with-component-wise-abs/ + + // two times rotated extent + const VectorType rotated_extent_2 = transform.linear().cwiseAbs() * sizes(); + // two times new center + const VectorType rotated_center_2 = transform.linear() * (this->m_max + this->m_min) + + Scalar(2) * transform.translation(); + + this->m_max = (rotated_center_2 + rotated_extent_2) / Scalar(2); + this->m_min = (rotated_center_2 - rotated_extent_2) / Scalar(2); + } + + /** + * \returns a copy of \c *this transformed by \a transform and recomputed to + * still be an axis-aligned box. + */ + template + EIGEN_DEVICE_FUNC AlignedBox transformed(const Transform& transform) const + { + AlignedBox result(m_min, m_max); + result.transform(transform); + return result; + } /** \returns \c *this with scalar type casted to \a NewScalarType * diff --git a/inst/include/Eigen/src/Geometry/AngleAxis.h b/inst/include/Eigen/src/Geometry/AngleAxis.h index 83ee1be4..78328b6b 100644 --- a/inst/include/Eigen/src/Geometry/AngleAxis.h +++ b/inst/include/Eigen/src/Geometry/AngleAxis.h @@ -169,8 +169,8 @@ template template EIGEN_DEVICE_FUNC AngleAxis& AngleAxis::operator=(const QuaternionBase& q) { - EIGEN_USING_STD_MATH(atan2) - EIGEN_USING_STD_MATH(abs) + EIGEN_USING_STD(atan2) + EIGEN_USING_STD(abs) Scalar n = q.vec().norm(); if(n::epsilon()) n = q.vec().stableNorm(); @@ -217,8 +217,8 @@ template typename AngleAxis::Matrix3 EIGEN_DEVICE_FUNC AngleAxis::toRotationMatrix(void) const { - EIGEN_USING_STD_MATH(sin) - EIGEN_USING_STD_MATH(cos) + EIGEN_USING_STD(sin) + EIGEN_USING_STD(cos) Matrix3 res; Vector3 sin_axis = sin(m_angle) * m_axis; Scalar c = cos(m_angle); diff --git a/inst/include/Eigen/src/Geometry/EulerAngles.h b/inst/include/Eigen/src/Geometry/EulerAngles.h index c633268a..19b734ca 100644 --- 
a/inst/include/Eigen/src/Geometry/EulerAngles.h +++ b/inst/include/Eigen/src/Geometry/EulerAngles.h @@ -36,9 +36,9 @@ template EIGEN_DEVICE_FUNC inline Matrix::Scalar,3,1> MatrixBase::eulerAngles(Index a0, Index a1, Index a2) const { - EIGEN_USING_STD_MATH(atan2) - EIGEN_USING_STD_MATH(sin) - EIGEN_USING_STD_MATH(cos) + EIGEN_USING_STD(atan2) + EIGEN_USING_STD(sin) + EIGEN_USING_STD(cos) /* Implemented from Graphics Gems IV */ EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived,3,3) diff --git a/inst/include/Eigen/src/Geometry/Homogeneous.h b/inst/include/Eigen/src/Geometry/Homogeneous.h index 5f0da1a9..94083ac5 100644 --- a/inst/include/Eigen/src/Geometry/Homogeneous.h +++ b/inst/include/Eigen/src/Geometry/Homogeneous.h @@ -10,7 +10,7 @@ #ifndef EIGEN_HOMOGENEOUS_H #define EIGEN_HOMOGENEOUS_H -namespace Eigen { +namespace Eigen { /** \geometry_module \ingroup Geometry_Module * @@ -72,9 +72,11 @@ template class Homogeneous : m_matrix(matrix) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical ? 1 : 0); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); } - + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows() + (int(Direction)==Vertical ? 1 : 0); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols() + (int(Direction)==Horizontal ? 
1 : 0); } + EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; } template @@ -262,8 +264,10 @@ struct homogeneous_left_product_impl,Lhs> m_rhs(rhs) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } template EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const { @@ -300,8 +304,8 @@ struct homogeneous_right_product_impl,Rhs> : m_lhs(lhs), m_rhs(rhs) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } template EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const { @@ -322,7 +326,7 @@ template struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; - typedef HomogeneousShape Shape; + typedef HomogeneousShape Shape; }; template<> struct AssignmentKind { typedef Dense2Dense Kind; }; @@ -414,7 +418,7 @@ struct product_evaluator, ProductTag, Homogeneous typedef typename helper::ConstantBlock ConstantBlock; typedef typename helper::Xpr RefactoredXpr; typedef evaluator Base; - + EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base( xpr.lhs().nestedExpression() .lazyProduct( xpr.rhs().template topRows(xpr.lhs().nestedExpression().cols()) ) + ConstantBlock(xpr.rhs().row(xpr.rhs().rows()-1),xpr.lhs().rows(), 1) ) @@ -467,7 +471,7 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename helper::ConstantBlock ConstantBlock; typedef typename helper::Xpr RefactoredXpr; typedef evaluator Base; - + 
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base( xpr.lhs().template leftCols(xpr.rhs().nestedExpression().rows()) .lazyProduct( xpr.rhs().nestedExpression() ) + ConstantBlock(xpr.lhs().col(xpr.lhs().cols()-1),1,xpr.rhs().cols()) ) diff --git a/inst/include/Eigen/src/Geometry/Hyperplane.h b/inst/include/Eigen/src/Geometry/Hyperplane.h index 05929b29..cebe0355 100644 --- a/inst/include/Eigen/src/Geometry/Hyperplane.h +++ b/inst/include/Eigen/src/Geometry/Hyperplane.h @@ -119,7 +119,7 @@ class Hyperplane * If the dimension of the ambient space is greater than 2, then there isn't uniqueness, * so an arbitrary choice is made. */ - // FIXME to be consitent with the rest this could be implemented as a static Through function ?? + // FIXME to be consistent with the rest this could be implemented as a static Through function ?? EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine& parametrized) { normal() = parametrized.direction().unitOrthogonal(); diff --git a/inst/include/Eigen/src/Geometry/OrthoMethods.h b/inst/include/Eigen/src/Geometry/OrthoMethods.h index a035e631..524aebe1 100644 --- a/inst/include/Eigen/src/Geometry/OrthoMethods.h +++ b/inst/include/Eigen/src/Geometry/OrthoMethods.h @@ -27,9 +27,10 @@ namespace Eigen { template template #ifndef EIGEN_PARSED_BY_DOXYGEN -EIGEN_DEVICE_FUNC inline typename MatrixBase::template cross_product_return_type::type +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename MatrixBase::template cross_product_return_type::type #else -inline typename MatrixBase::PlainObject +typename MatrixBase::PlainObject #endif MatrixBase::cross(const MatrixBase& other) const { diff --git a/inst/include/Eigen/src/Geometry/ParametrizedLine.h b/inst/include/Eigen/src/Geometry/ParametrizedLine.h index 1e985d8c..584f5008 100644 --- a/inst/include/Eigen/src/Geometry/ParametrizedLine.h +++ b/inst/include/Eigen/src/Geometry/ParametrizedLine.h @@ -87,7 +87,7 @@ class ParametrizedLine /** \returns the distance of a point \a p to 
its projection onto the line \c *this. * \sa squaredDistance() */ - EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const { EIGEN_USING_STD_MATH(sqrt) return sqrt(squaredDistance(p)); } + EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const { EIGEN_USING_STD(sqrt) return sqrt(squaredDistance(p)); } /** \returns the projection of a point \a p onto the line \c *this. */ EIGEN_DEVICE_FUNC VectorType projection(const VectorType& p) const @@ -104,7 +104,44 @@ class ParametrizedLine template EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const; - /** \returns \c *this with scalar type casted to \a NewScalarType + /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this. + * + * \param mat the Dim x Dim transformation matrix + * \param traits specifies whether the matrix \a mat represents an #Isometry + * or a more generic #Affine transformation. The default is #Affine. + */ + template + EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const MatrixBase& mat, TransformTraits traits = Affine) + { + if (traits==Affine) + direction() = (mat * direction()).normalized(); + else if (traits==Isometry) + direction() = mat * direction(); + else + { + eigen_assert(0 && "invalid traits value in ParametrizedLine::transform()"); + } + origin() = mat * origin(); + return *this; + } + + /** Applies the transformation \a t to \c *this and returns a reference to \c *this. + * + * \param t the transformation of dimension Dim + * \param traits specifies whether the transformation \a t represents an #Isometry + * or a more generic #Affine transformation. The default is #Affine. + * Other kind of transformations are not supported. 
+ */ + template + EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const Transform& t, + TransformTraits traits = Affine) + { + transform(t.linear(), traits); + origin() += t.translation(); + return *this; + } + +/** \returns \c *this with scalar type casted to \a NewScalarType * * Note that if \a NewScalarType is equal to the current scalar type of \c *this * then this function smartly returns a const reference to \c *this. diff --git a/inst/include/Eigen/src/Geometry/Quaternion.h b/inst/include/Eigen/src/Geometry/Quaternion.h index b8182065..3259e592 100644 --- a/inst/include/Eigen/src/Geometry/Quaternion.h +++ b/inst/include/Eigen/src/Geometry/Quaternion.h @@ -141,7 +141,7 @@ class QuaternionBase : public RotationBase template EIGEN_DEVICE_FUNC Scalar angularDistance(const QuaternionBase& other) const; /** \returns an equivalent 3x3 rotation matrix */ - EIGEN_DEVICE_FUNC Matrix3 toRotationMatrix() const; + EIGEN_DEVICE_FUNC inline Matrix3 toRotationMatrix() const; /** \returns the quaternion which transform \a a into \a b through a rotation */ template @@ -158,6 +158,22 @@ class QuaternionBase : public RotationBase template EIGEN_DEVICE_FUNC Quaternion slerp(const Scalar& t, const QuaternionBase& other) const; + /** \returns true if each coefficients of \c *this and \a other are all exactly equal. + * \warning When using floating point scalar values you probably should rather use a + * fuzzy comparison such as isApprox() + * \sa isApprox(), operator!= */ + template + EIGEN_DEVICE_FUNC inline bool operator==(const QuaternionBase& other) const + { return coeffs() == other.coeffs(); } + + /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. 
+ * \warning When using floating point scalar values you probably should rather use a + * fuzzy comparison such as isApprox() + * \sa isApprox(), operator== */ + template + EIGEN_DEVICE_FUNC inline bool operator!=(const QuaternionBase& other) const + { return coeffs() != other.coeffs(); } + /** \returns \c true if \c *this is approximately equal to \a other, within the precision * determined by \a prec. * @@ -181,20 +197,27 @@ class QuaternionBase : public RotationBase #else template - EIGEN_DEVICE_FUNC inline + EIGEN_DEVICE_FUNC inline typename internal::enable_if::value,const Derived&>::type cast() const { return derived(); } template - EIGEN_DEVICE_FUNC inline + EIGEN_DEVICE_FUNC inline typename internal::enable_if::value,Quaternion >::type cast() const { return Quaternion(coeffs().template cast()); } #endif +#ifndef EIGEN_NO_IO + friend std::ostream& operator<<(std::ostream& s, const QuaternionBase& q) { + s << q.x() << "i + " << q.y() << "j + " << q.z() << "k" << " + " << q.w(); + return s; + } +#endif + #ifdef EIGEN_QUATERNIONBASE_PLUGIN # include EIGEN_QUATERNIONBASE_PLUGIN #endif @@ -294,6 +317,21 @@ class Quaternion : public QuaternionBase > EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion& other) { m_coeffs = other.coeffs().template cast(); } +#if EIGEN_HAS_RVALUE_REFERENCES + // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator. 
+ /** Default move constructor */ + EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) + : m_coeffs(std::move(other.coeffs())) + {} + + /** Default move assignment operator */ + EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) + { + m_coeffs = std::move(other.coeffs()); + return *this; + } +#endif + EIGEN_DEVICE_FUNC static Quaternion UnitRandom(); template @@ -522,8 +560,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase::operator template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase::operator=(const AngleAxisType& aa) { - EIGEN_USING_STD_MATH(cos) - EIGEN_USING_STD_MATH(sin) + EIGEN_USING_STD(cos) + EIGEN_USING_STD(sin) Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings this->w() = cos(ha); this->vec() = sin(ha) * aa.axis(); @@ -599,7 +637,7 @@ template template EIGEN_DEVICE_FUNC inline Derived& QuaternionBase::setFromTwoVectors(const MatrixBase& a, const MatrixBase& b) { - EIGEN_USING_STD_MATH(sqrt) + EIGEN_USING_STD(sqrt) Vector3 v0 = a.normalized(); Vector3 v1 = b.normalized(); Scalar c = v1.dot(v0); @@ -640,13 +678,13 @@ EIGEN_DEVICE_FUNC inline Derived& QuaternionBase::setFromTwoVectors(con template EIGEN_DEVICE_FUNC Quaternion Quaternion::UnitRandom() { - EIGEN_USING_STD_MATH(sqrt) - EIGEN_USING_STD_MATH(sin) - EIGEN_USING_STD_MATH(cos) + EIGEN_USING_STD(sqrt) + EIGEN_USING_STD(sin) + EIGEN_USING_STD(cos) const Scalar u1 = internal::random(0, 1), u2 = internal::random(0, 2*EIGEN_PI), u3 = internal::random(0, 2*EIGEN_PI); - const Scalar a = sqrt(1 - u1), + const Scalar a = sqrt(Scalar(1) - u1), b = sqrt(u1); return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3)); } @@ -725,7 +763,7 @@ template EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar QuaternionBase::angularDistance(const QuaternionBase& other) const { - 
EIGEN_USING_STD_MATH(atan2) + EIGEN_USING_STD(atan2) Quaternion d = (*this) * other.conjugate(); return Scalar(2) * atan2( d.vec().norm(), numext::abs(d.w()) ); } @@ -743,8 +781,8 @@ template EIGEN_DEVICE_FUNC Quaternion::Scalar> QuaternionBase::slerp(const Scalar& t, const QuaternionBase& other) const { - EIGEN_USING_STD_MATH(acos) - EIGEN_USING_STD_MATH(sin) + EIGEN_USING_STD(acos) + EIGEN_USING_STD(sin) const Scalar one = Scalar(1) - NumTraits::epsilon(); Scalar d = this->dot(other); Scalar absD = numext::abs(d); @@ -781,7 +819,7 @@ struct quaternionbase_assign_impl template EIGEN_DEVICE_FUNC static inline void run(QuaternionBase& q, const Other& a_mat) { const typename internal::nested_eval::type mat(a_mat); - EIGEN_USING_STD_MATH(sqrt) + EIGEN_USING_STD(sqrt) // This algorithm comes from "Quaternion Calculus and Fast Animation", // Ken Shoemake, 1987 SIGGRAPH course notes Scalar t = mat.trace(); diff --git a/inst/include/Eigen/src/Geometry/Rotation2D.h b/inst/include/Eigen/src/Geometry/Rotation2D.h index 884b7d0e..d0bd5756 100644 --- a/inst/include/Eigen/src/Geometry/Rotation2D.h +++ b/inst/include/Eigen/src/Geometry/Rotation2D.h @@ -175,7 +175,7 @@ template template EIGEN_DEVICE_FUNC Rotation2D& Rotation2D::fromRotationMatrix(const MatrixBase& mat) { - EIGEN_USING_STD_MATH(atan2) + EIGEN_USING_STD(atan2) EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE) m_angle = atan2(mat.coeff(1,0), mat.coeff(0,0)); return *this; @@ -187,8 +187,8 @@ template typename Rotation2D::Matrix2 EIGEN_DEVICE_FUNC Rotation2D::toRotationMatrix(void) const { - EIGEN_USING_STD_MATH(sin) - EIGEN_USING_STD_MATH(cos) + EIGEN_USING_STD(sin) + EIGEN_USING_STD(cos) Scalar sinA = sin(m_angle); Scalar cosA = cos(m_angle); return (Matrix2() << cosA, -sinA, sinA, cosA).finished(); diff --git a/inst/include/Eigen/src/Geometry/Scaling.h b/inst/include/Eigen/src/Geometry/Scaling.h index 33eabd81..d352f1f2 100644 --- 
a/inst/include/Eigen/src/Geometry/Scaling.h +++ b/inst/include/Eigen/src/Geometry/Scaling.h @@ -29,6 +29,22 @@ namespace Eigen { * * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform */ + +namespace internal +{ + // This helper helps nvcc+MSVC to properly parse this file. + // See bug 1412. + template + struct uniformscaling_times_affine_returntype + { + enum + { + NewMode = int(Mode) == int(Isometry) ? Affine : Mode + }; + typedef Transform type; + }; +} + template class UniformScaling { @@ -60,9 +76,11 @@ class UniformScaling /** Concatenates a uniform scaling and an affine transformation */ template - inline Transform operator* (const Transform& t) const + inline typename + internal::uniformscaling_times_affine_returntype::type + operator* (const Transform& t) const { - Transform res = t; + typename internal::uniformscaling_times_affine_returntype::type res = t; res.prescale(factor()); return res; } @@ -70,7 +88,7 @@ class UniformScaling /** Concatenates a uniform scaling and a linear transformation matrix */ // TODO returns an expression template - inline typename internal::plain_matrix_type::type operator* (const MatrixBase& other) const + inline typename Eigen::internal::plain_matrix_type::type operator* (const MatrixBase& other) const { return other * m_factor; } template @@ -110,7 +128,7 @@ class UniformScaling /** Concatenates a linear transformation matrix and a uniform scaling * \relates UniformScaling */ -// NOTE this operator is defiend in MatrixBase and not as a friend function +// NOTE this operator is defined in MatrixBase and not as a friend function // of UniformScaling to fix an internal crash of Intel's ICC template EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product) diff --git a/inst/include/Eigen/src/Geometry/Transform.h b/inst/include/Eigen/src/Geometry/Transform.h index c21d9e55..52b8c2a4 100644 --- a/inst/include/Eigen/src/Geometry/Transform.h +++ 
b/inst/include/Eigen/src/Geometry/Transform.h @@ -12,7 +12,7 @@ #ifndef EIGEN_TRANSFORM_H #define EIGEN_TRANSFORM_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -47,7 +47,7 @@ struct transform_left_product_impl; template< typename Lhs, typename Rhs, - bool AnyProjective = + bool AnyProjective = transform_traits::IsProjective || transform_traits::IsProjective> struct transform_transform_product_impl; @@ -97,6 +97,9 @@ template struct transform_make_affine; * - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix. * - #Projective: the transformation is stored as a (Dim+1)^2 matrix * without any assumption. + * - #Isometry: same as #Affine with the additional assumption that + * the linear part represents a rotation. This assumption is exploited + * to speed up some functions such as inverse() and rotation(). * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor. * These Options are passed directly to the underlying matrix type. * @@ -115,7 +118,7 @@ template struct transform_make_affine; * \end{array} \right) \f$ * * Note that for a projective transformation the last row can be anything, - * and then the interpretation of different parts might be sightly different. + * and then the interpretation of different parts might be slightly different. * * However, unlike a plain matrix, the Transform class provides many features * simplifying both its assembly and usage. 
In particular, it can be composed @@ -220,9 +223,9 @@ class Transform /** type of the matrix used to represent the linear part of the transformation */ typedef Matrix LinearMatrixType; /** type of read/write reference to the linear part of the transformation */ - typedef Block LinearPart; + typedef Block LinearPart; /** type of read reference to the linear part of the transformation */ - typedef const Block ConstLinearPart; + typedef const Block ConstLinearPart; /** type of read/write reference to the affine part of the transformation */ typedef typename internal::conditional::Flags & RowMajorBit)> ConstTranslationPart; /** corresponding translation type */ typedef Translation TranslationType; - + // this intermediate enum is needed to avoid an ICE with gcc 3.4 and 4.0 enum { TransformTimeDiagonalMode = ((Mode==int(Isometry))?Affine:int(Mode)) }; /** The return type of the product between a diagonal matrix and a transform */ @@ -259,12 +262,6 @@ class Transform internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix); } - EIGEN_DEVICE_FUNC inline Transform(const Transform& other) - { - check_template_params(); - m_matrix = other.m_matrix; - } - EIGEN_DEVICE_FUNC inline explicit Transform(const TranslationType& t) { check_template_params(); @@ -282,9 +279,6 @@ class Transform *this = r; } - EIGEN_DEVICE_FUNC inline Transform& operator=(const Transform& other) - { m_matrix = other.m_matrix; return *this; } - typedef internal::transform_take_affine_part take_affine_part; /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. 
*/ @@ -308,7 +302,7 @@ class Transform internal::transform_construct_from_matrix::run(this, other.derived()); return *this; } - + template EIGEN_DEVICE_FUNC inline Transform(const Transform& other) { @@ -335,7 +329,7 @@ class Transform OtherModeIsAffineCompact = OtherMode == int(AffineCompact) }; - if(ModeIsAffineCompact == OtherModeIsAffineCompact) + if(EIGEN_CONST_CONDITIONAL(ModeIsAffineCompact == OtherModeIsAffineCompact)) { // We need the block expression because the code is compiled for all // combinations of transformations and will trigger a compile time error @@ -343,7 +337,7 @@ class Transform m_matrix.template block(0,0) = other.matrix().template block(0,0); makeAffine(); } - else if(OtherModeIsAffineCompact) + else if(EIGEN_CONST_CONDITIONAL(OtherModeIsAffineCompact)) { typedef typename Transform::MatrixType OtherMatrixType; internal::transform_construct_from_matrix::run(this, other.matrix()); @@ -380,9 +374,9 @@ class Transform inline Transform& operator=(const QTransform& other); inline QTransform toQTransform(void) const; #endif - - EIGEN_DEVICE_FUNC Index rows() const { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); } - EIGEN_DEVICE_FUNC Index cols() const { return m_matrix.cols(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** shortcut for m_matrix(row,col); * \sa MatrixBase::operator(Index,Index) const */ @@ -456,7 +450,7 @@ class Transform /** \returns The product expression of a transform \a a times a diagonal matrix \a b * * The rhs diagonal matrix is interpreted as an affine scaling transformation. The - * product results in a Transform of the same type (mode) as the lhs only if the lhs + * product results in a Transform of the same type (mode) as the lhs only if the lhs * mode is no isometry. 
In that case, the returned transform is an affinity. */ template @@ -471,7 +465,7 @@ class Transform /** \returns The product expression of a diagonal matrix \a a times a transform \a b * * The lhs diagonal matrix is interpreted as an affine scaling transformation. The - * product results in a Transform of the same type (mode) as the lhs only if the lhs + * product results in a Transform of the same type (mode) as the lhs only if the lhs * mode is no isometry. In that case, the returned transform is an affinity. */ template @@ -481,7 +475,7 @@ class Transform TransformTimeDiagonalReturnType res; res.linear().noalias() = a*b.linear(); res.translation().noalias() = a*b.translation(); - if (Mode!=int(AffineCompact)) + if (EIGEN_CONST_CONDITIONAL(Mode!=int(AffineCompact))) res.matrix().row(Dim) = b.matrix().row(Dim); return res; } @@ -494,7 +488,7 @@ class Transform { return internal::transform_transform_product_impl::run(*this,other); } - + #if EIGEN_COMP_ICC private: // this intermediate structure permits to workaround a bug in ICC 11: @@ -503,13 +497,13 @@ class Transform // (the meaning of a name may have changed since the template declaration -- the type of the template is: // "Eigen::internal::transform_transform_product_impl, // Eigen::Transform, >::ResultType (const Eigen::Transform &) const") - // + // template struct icc_11_workaround { typedef internal::transform_transform_product_impl > ProductType; typedef typename ProductType::ResultType ResultType; }; - + public: /** Concatenates two different transformations */ template @@ -542,7 +536,7 @@ class Transform } template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC inline Transform& scale(const MatrixBase &other); template @@ -572,18 +566,18 @@ class Transform EIGEN_DEVICE_FUNC Transform& preshear(const Scalar& sx, const Scalar& sy); EIGEN_DEVICE_FUNC inline Transform& operator=(const TranslationType& t); - + EIGEN_DEVICE_FUNC inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); 
} - + EIGEN_DEVICE_FUNC inline Transform operator*(const TranslationType& t) const; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC inline Transform& operator=(const UniformScaling& t); - + EIGEN_DEVICE_FUNC inline Transform& operator*=(const UniformScaling& s) { return scale(s.factor()); } - + EIGEN_DEVICE_FUNC inline TransformTimeDiagonalReturnType operator*(const UniformScaling& s) const { @@ -602,7 +596,9 @@ class Transform template EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase& r) const; - EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const; + typedef typename internal::conditional::type RotationReturnType; + EIGEN_DEVICE_FUNC RotationReturnType rotation() const; + template EIGEN_DEVICE_FUNC void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const; @@ -684,7 +680,7 @@ class Transform #ifdef EIGEN_TRANSFORM_PLUGIN #include EIGEN_TRANSFORM_PLUGIN #endif - + protected: #ifndef EIGEN_PARSED_BY_DOXYGEN EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void check_template_params() @@ -755,7 +751,7 @@ template Transform& Transform::operator=(const QMatrix& other) { EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (Mode == int(AffineCompact)) + if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy(); else @@ -801,7 +797,7 @@ Transform& Transform::operator { check_template_params(); EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (Mode == int(AffineCompact)) + if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy(); else @@ -819,7 +815,7 @@ template QTransform Transform::toQTransform(void) const { EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (Mode == int(AffineCompact)) + if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), 
m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(0,2), m_matrix.coeff(1,2)); @@ -912,7 +908,7 @@ EIGEN_DEVICE_FUNC Transform& Transform::pretranslate(const MatrixBase &other) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim)) - if(int(Mode)==int(Projective)) + if(EIGEN_CONST_CONDITIONAL(int(Mode)==int(Projective))) affine() += other * m_matrix.row(Dim); else translation() += other; @@ -1046,20 +1042,43 @@ EIGEN_DEVICE_FUNC inline Transform Transform struct transform_rotation_impl { + template + EIGEN_DEVICE_FUNC static inline + const typename TransformType::LinearMatrixType run(const TransformType& t) + { + typedef typename TransformType::LinearMatrixType LinearMatrixType; + LinearMatrixType result; + t.computeRotationScaling(&result, (LinearMatrixType*)0); + return result; + } +}; +template<> struct transform_rotation_impl { + template + EIGEN_DEVICE_FUNC static inline + typename TransformType::ConstLinearPart run(const TransformType& t) + { + return t.linear(); + } +}; +} /** \returns the rotation part of the transformation * + * If Mode==Isometry, then this method is an alias for linear(), + * otherwise it calls computeRotationScaling() to extract the rotation + * through a SVD decomposition. * * \svd_module * * \sa computeRotationScaling(), computeScalingRotation(), class SVD */ template -EIGEN_DEVICE_FUNC const typename Transform::LinearMatrixType +EIGEN_DEVICE_FUNC +typename Transform::RotationReturnType Transform::rotation() const { - LinearMatrixType result; - computeRotationScaling(&result, (LinearMatrixType*)0); - return result; + return internal::transform_rotation_impl::run(*this); } @@ -1078,17 +1097,18 @@ template template EIGEN_DEVICE_FUNC void Transform::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const { + // Note that JacobiSVD is faster than BDCSVD for small matrices. 
JacobiSVD svd(linear(), ComputeFullU | ComputeFullV); - Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1 + Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1 VectorType sv(svd.singularValues()); - sv.coeffRef(0) *= x; - if(scaling) scaling->lazyAssign(svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint()); + sv.coeffRef(Dim-1) *= x; + if(scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint(); if(rotation) { LinearMatrixType m(svd.matrixU()); - m.col(0) /= x; - rotation->lazyAssign(m * svd.matrixV().adjoint()); + m.col(Dim-1) *= x; + *rotation = m * svd.matrixV().adjoint(); } } @@ -1107,17 +1127,18 @@ template template EIGEN_DEVICE_FUNC void Transform::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const { + // Note that JacobiSVD is faster than BDCSVD for small matrices. JacobiSVD svd(linear(), ComputeFullU | ComputeFullV); - Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1 + Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? 
Scalar(-1) : Scalar(1); // so x has absolute value 1 VectorType sv(svd.singularValues()); - sv.coeffRef(0) *= x; - if(scaling) scaling->lazyAssign(svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint()); + sv.coeffRef(Dim-1) *= x; + if(scaling) *scaling = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint(); if(rotation) { LinearMatrixType m(svd.matrixU()); - m.col(0) /= x; - rotation->lazyAssign(m * svd.matrixV().adjoint()); + m.col(Dim-1) *= x; + *rotation = m * svd.matrixV().adjoint(); } } @@ -1156,7 +1177,7 @@ struct transform_make_affine { template EIGEN_DEVICE_FUNC static void run(MatrixType &) { } }; - + // selector needed to avoid taking the inverse of a 3x4 matrix template struct projective_transform_inverse @@ -1297,8 +1318,8 @@ struct transform_construct_from_matrix struct transform_product_result { - enum - { + enum + { Mode = (LhsMode == (int)Projective || RhsMode == (int)Projective ) ? Projective : (LhsMode == (int)Affine || RhsMode == (int)Affine ) ? Affine : @@ -1312,7 +1333,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols> { typedef typename MatrixType::PlainObject ResultType; - static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) { return T.matrix() * other; } @@ -1321,8 +1342,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols> template< typename TransformType, typename MatrixType, int RhsCols> struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols> { - enum { - Dim = TransformType::Dim, + enum { + Dim = TransformType::Dim, HDim = TransformType::HDim, OtherRows = MatrixType::RowsAtCompileTime, OtherCols = MatrixType::ColsAtCompileTime @@ -1330,7 +1351,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols> typedef typename MatrixType::PlainObject ResultType; - static EIGEN_STRONG_INLINE 
ResultType run(const TransformType& T, const MatrixType& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) { EIGEN_STATIC_ASSERT(OtherRows==HDim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES); @@ -1339,7 +1360,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols> ResultType res(other.rows(),other.cols()); TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() = T.affine() * other; res.row(OtherRows-1) = other.row(OtherRows-1); - + return res; } }; @@ -1347,8 +1368,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols> template< typename TransformType, typename MatrixType, int RhsCols> struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols> { - enum { - Dim = TransformType::Dim, + enum { + Dim = TransformType::Dim, HDim = TransformType::HDim, OtherRows = MatrixType::RowsAtCompileTime, OtherCols = MatrixType::ColsAtCompileTime @@ -1356,7 +1377,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols> typedef typename MatrixType::PlainObject ResultType; - static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) { EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES); @@ -1381,7 +1402,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is typedef typename MatrixType::PlainObject ResultType; - static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) { EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES); diff --git a/inst/include/Eigen/src/Geometry/Translation.h b/inst/include/Eigen/src/Geometry/Translation.h index 0e99ce68..8c229012 100644 --- 
a/inst/include/Eigen/src/Geometry/Translation.h +++ b/inst/include/Eigen/src/Geometry/Translation.h @@ -70,18 +70,18 @@ class Translation /** Constructs and initialize the translation transformation from a vector of translation coefficients */ EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {} - /** \brief Retruns the x-translation by value. **/ + /** \brief Returns the x-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); } - /** \brief Retruns the y-translation by value. **/ + /** \brief Returns the y-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); } - /** \brief Retruns the z-translation by value. **/ + /** \brief Returns the z-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); } - /** \brief Retruns the x-translation as a reference. **/ + /** \brief Returns the x-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); } - /** \brief Retruns the y-translation as a reference. **/ + /** \brief Returns the y-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); } - /** \brief Retruns the z-translation as a reference. **/ + /** \brief Returns the z-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); } EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; } diff --git a/inst/include/Eigen/src/Geometry/arch/Geometry_SIMD.h b/inst/include/Eigen/src/Geometry/arch/Geometry_SIMD.h new file mode 100644 index 00000000..9af6a9af --- /dev/null +++ b/inst/include/Eigen/src/Geometry/arch/Geometry_SIMD.h @@ -0,0 +1,168 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Rohit Garg +// Copyright (C) 2009-2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GEOMETRY_SIMD_H +#define EIGEN_GEOMETRY_SIMD_H + +namespace Eigen { + +namespace internal { + +template +struct quat_product +{ + enum { + AAlignment = traits::Alignment, + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) + { + evaluator ae(_a.coeffs()); + evaluator be(_b.coeffs()); + Quaternion res; + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {0.f, 0.f, 0.f, neg_zero}; + const Packet4f mask = ploadu(arr); + Packet4f a = ae.template packet(0); + Packet4f b = be.template packet(0); + Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); + Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); + pstoret( + &res.x(), + padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)), + pmul(vec4f_swizzle1(a,2,0,1,0), + vec4f_swizzle1(b,1,2,0,0))), + pxor(mask,padd(s1,s2)))); + + return res; + } +}; + +template +struct quat_conj +{ + enum { + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& q) + { + evaluator qe(q.coeffs()); + Quaternion res; + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {neg_zero, neg_zero, neg_zero,0.f}; + const Packet4f mask = ploadu(arr); + pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); + return res; + } +}; + + +template +struct cross3_impl +{ + enum { + ResAlignment = traits::type>::Alignment + }; + static inline typename plain_matrix_type::type + run(const VectorLhs& lhs, const VectorRhs& rhs) + { + evaluator lhs_eval(lhs); + evaluator rhs_eval(rhs); + Packet4f a = lhs_eval.template packet::Alignment,Packet4f>(0); + Packet4f b = rhs_eval.template packet::Alignment,Packet4f>(0); + Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); + 
Packet4f mul2 = pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); + typename plain_matrix_type::type res; + pstoret(&res.x(),psub(mul1,mul2)); + return res; + } +}; + + + +#if (defined EIGEN_VECTORIZE_SSE) || (EIGEN_ARCH_ARM64) + +template +struct quat_product +{ + enum { + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; + + static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) + { + Quaternion res; + + evaluator ae(_a.coeffs()); + evaluator be(_b.coeffs()); + + const double* a = _a.coeffs().data(); + Packet2d b_xy = be.template packet(0); + Packet2d b_zw = be.template packet(2); + Packet2d a_xx = pset1(a[0]); + Packet2d a_yy = pset1(a[1]); + Packet2d a_zz = pset1(a[2]); + Packet2d a_ww = pset1(a[3]); + + // two temporaries: + Packet2d t1, t2; + + /* + * t1 = ww*xy + yy*zw + * t2 = zz*xy - xx*zw + * res.xy = t1 +/- swap(t2) + */ + t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); + t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); + pstoret(&res.x(), paddsub(t1, preverse(t2))); + + /* + * t1 = ww*zw - yy*xy + * t2 = zz*zw + xx*xy + * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) + */ + t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); + t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); + pstoret(&res.z(), preverse(paddsub(preverse(t1), t2))); + + return res; +} +}; + +template +struct quat_conj +{ + enum { + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& q) + { + evaluator qe(q.coeffs()); + Quaternion res; + const double neg_zero = numext::bit_cast(0x8000000000000000ull); + const double arr1[2] = {neg_zero, neg_zero}; + const double arr2[2] = {neg_zero, 0.0}; + const Packet2d mask0 = ploadu(arr1); + const Packet2d mask2 = ploadu(arr2); + pstoret(&res.x(), pxor(mask0, qe.template packet::Alignment,Packet2d>(0))); + pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); + return res; + } +}; + +#endif // end EIGEN_VECTORIZE_SSE_OR_EIGEN_ARCH_ARM64 
+ +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_GEOMETRY_SIMD_H diff --git a/inst/include/Eigen/src/Geometry/arch/Geometry_SSE.h b/inst/include/Eigen/src/Geometry/arch/Geometry_SSE.h deleted file mode 100644 index f68cab58..00000000 --- a/inst/include/Eigen/src/Geometry/arch/Geometry_SSE.h +++ /dev/null @@ -1,161 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2009 Rohit Garg -// Copyright (C) 2009-2010 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_GEOMETRY_SSE_H -#define EIGEN_GEOMETRY_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct quat_product -{ - enum { - AAlignment = traits::Alignment, - BAlignment = traits::Alignment, - ResAlignment = traits >::Alignment - }; - static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) - { - Quaternion res; - const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - __m128 a = _a.coeffs().template packet(0); - __m128 b = _b.coeffs().template packet(0); - __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); - __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); - pstoret( - &res.x(), - _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), - _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), - vec4f_swizzle1(b,1,2,0,0))), - _mm_xor_ps(mask,_mm_add_ps(s1,s2)))); - - return res; - } -}; - -template -struct quat_conj -{ - enum { - ResAlignment = traits >::Alignment - }; - static inline Quaternion run(const QuaternionBase& q) - { - Quaternion res; - const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstoret(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet::Alignment>(0))); - return res; - } -}; - - -template -struct cross3_impl -{ - enum { - 
ResAlignment = traits::type>::Alignment - }; - static inline typename plain_matrix_type::type - run(const VectorLhs& lhs, const VectorRhs& rhs) - { - __m128 a = lhs.template packet::Alignment>(0); - __m128 b = rhs.template packet::Alignment>(0); - __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); - __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); - typename plain_matrix_type::type res; - pstoret(&res.x(),_mm_sub_ps(mul1,mul2)); - return res; - } -}; - - - - -template -struct quat_product -{ - enum { - BAlignment = traits::Alignment, - ResAlignment = traits >::Alignment - }; - - static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) - { - const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - - Quaternion res; - - const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); - Packet2d a_xx = pset1(a[0]); - Packet2d a_yy = pset1(a[1]); - Packet2d a_zz = pset1(a[2]); - Packet2d a_ww = pset1(a[3]); - - // two temporaries: - Packet2d t1, t2; - - /* - * t1 = ww*xy + yy*zw - * t2 = zz*xy - xx*zw - * res.xy = t1 +/- swap(t2) - */ - t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); - t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstoret(&res.x(), _mm_addsub_pd(t1, preverse(t2))); -#else - pstoret(&res.x(), padd(t1, pxor(mask,preverse(t2)))); -#endif - - /* - * t1 = ww*zw - yy*xy - * t2 = zz*zw + xx*xy - * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) - */ - t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); - t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstoret(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); -#else - pstoret(&res.z(), psub(t1, pxor(mask,preverse(t2)))); -#endif - - return res; -} -}; - -template -struct quat_conj -{ - enum { - ResAlignment = traits 
>::Alignment - }; - static inline Quaternion run(const QuaternionBase& q) - { - Quaternion res; - const __m128d mask0 = _mm_setr_pd(-0.,-0.); - const __m128d mask2 = _mm_setr_pd(-0.,0.); - pstoret(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet::Alignment>(0))); - pstoret(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet::Alignment>(2))); - return res; - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_GEOMETRY_SSE_H diff --git a/inst/include/Eigen/src/Householder/BlockHouseholder.h b/inst/include/Eigen/src/Householder/BlockHouseholder.h index 01a7ed18..39ce1c2a 100644 --- a/inst/include/Eigen/src/Householder/BlockHouseholder.h +++ b/inst/include/Eigen/src/Householder/BlockHouseholder.h @@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() * vectors.bottomRightCorner(rs, rt).template triangularView(); - // FIXME add .noalias() once the triangular product can work inplace - triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); + // FIXME use the following line with .noalias() once the triangular product can work inplace + // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); + for(Index j=nbVecs-1; j>i; --j) + { + typename TriangularFactorType::Scalar z = triFactor(i,j); + triFactor(i,j) = z * triFactor(j,j); + if(nbVecs-j-1>0) + triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1); + } } triFactor(i,i) = hCoeffs(i); diff --git a/inst/include/Eigen/src/Householder/Householder.h b/inst/include/Eigen/src/Householder/Householder.h index 80de2c30..5bc037f0 100644 --- a/inst/include/Eigen/src/Householder/Householder.h +++ b/inst/include/Eigen/src/Householder/Householder.h @@ -39,6 +39,7 @@ template struct decrement_size * 
MatrixBase::applyHouseholderOnTheRight() */ template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock::ret> essentialPart(derived(), 1, size()-1); @@ -62,6 +63,7 @@ void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -103,13 +105,14 @@ void MatrixBase::makeHouseholder( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->cols() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheRight() */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -140,13 +143,14 @@ void MatrixBase::applyHouseholderOnTheLeft( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->rows() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheLeft() */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, @@ -160,10 +164,10 @@ void MatrixBase::applyHouseholderOnTheRight( { Map::type> tmp(workspace,rows()); Block right(derived(), 0, 1, rows(), cols()-1); - tmp.noalias() = right * essential.conjugate(); + tmp.noalias() = right * essential; tmp += this->col(0); this->col(0) -= tau * tmp; - right.noalias() -= tau * tmp * essential.transpose(); + right.noalias() -= tau * tmp * essential.adjoint(); } } diff --git 
a/inst/include/Eigen/src/Householder/HouseholderSequence.h b/inst/include/Eigen/src/Householder/HouseholderSequence.h index 3ce0a693..022f6c3d 100644 --- a/inst/include/Eigen/src/Householder/HouseholderSequence.h +++ b/inst/include/Eigen/src/Householder/HouseholderSequence.h @@ -11,7 +11,7 @@ #ifndef EIGEN_HOUSEHOLDER_SEQUENCE_H #define EIGEN_HOUSEHOLDER_SEQUENCE_H -namespace Eigen { +namespace Eigen { /** \ingroup Householder_Module * \householder_module @@ -34,8 +34,8 @@ namespace Eigen { * form \f$ H = \prod_{i=0}^{n-1} H_i \f$ where the i-th Householder reflection is \f$ H_i = I - h_i v_i * v_i^* \f$. The i-th Householder coefficient \f$ h_i \f$ is a scalar and the i-th Householder vector \f$ * v_i \f$ is a vector of the form - * \f[ - * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. + * \f[ + * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. * \f] * The last \f$ n-i \f$ entries of \f$ v_i \f$ are called the essential part of the Householder vector. 
* @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block EssentialVectorType; typedef HouseholderSequence HouseholderSequenceType; - static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block(h.m_vectors, start, k, h.rows()-start, 1); @@ -120,7 +120,7 @@ template class HouseholderS : public EigenBase > { typedef typename internal::hseq_side_dependent_impl::EssentialVectorType EssentialVectorType; - + public: enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, @@ -140,6 +140,28 @@ template class HouseholderS Side > ConjugateReturnType; + typedef HouseholderSequence< + VectorsType, + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + CoeffsType>::type, + Side + > AdjointReturnType; + + typedef HouseholderSequence< + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + VectorsType>::type, + CoeffsType, + Side + > TransposeReturnType; + + typedef HouseholderSequence< + typename internal::add_const::type, + typename internal::add_const::type, + Side + > ConstHouseholderSequence; + /** \brief Constructor. * \param[in] v %Matrix containing the essential parts of the Householder vectors * \param[in] h Vector containing the Householder coefficients @@ -157,33 +179,37 @@ template class HouseholderS * * \sa setLength(), setShift() */ + EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) - : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()), + : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) { } /** \brief Copy constructor. 
*/ + EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), - m_trans(other.m_trans), + m_reverse(other.m_reverse), m_length(other.m_length), m_shift(other.m_shift) { } /** \brief Number of rows of transformation viewed as a matrix. - * \returns Number of rows + * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ - Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ - Index cols() const { return rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return rows(); } /** \brief Essential part of a Householder vector. * \param[in] k Index of Householder reflection @@ -191,14 +217,15 @@ template class HouseholderS * * This function returns the essential part of the Householder vector \f$ v_i \f$. This is a vector of * length \f$ n-i \f$ containing the last \f$ n-i \f$ entries of the vector - * \f[ - * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. + * \f[ + * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. * \f] * The index \f$ i \f$ equals \p k + shift(), corresponding to the k-th column of the matrix \p v * passed to the constructor. * * \sa setShift(), shift() */ + EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -206,31 +233,51 @@ template class HouseholderS } /** \brief %Transpose of the Householder sequence. 
*/ - HouseholderSequence transpose() const + TransposeReturnType transpose() const { - return HouseholderSequence(*this).setTrans(!m_trans); + return TransposeReturnType(m_vectors.conjugate(), m_coeffs) + .setReverseFlag(!m_reverse) + .setLength(m_length) + .setShift(m_shift); } /** \brief Complex conjugate of the Householder sequence. */ ConjugateReturnType conjugate() const { return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate()) - .setTrans(m_trans) + .setReverseFlag(m_reverse) .setLength(m_length) .setShift(m_shift); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_vectors.template conjugateIf(), m_coeffs.template conjugateIf()); + } + /** \brief Adjoint (conjugate transpose) of the Householder sequence. */ - ConjugateReturnType adjoint() const + AdjointReturnType adjoint() const { - return conjugate().setTrans(!m_trans); + return AdjointReturnType(m_vectors, m_coeffs.conjugate()) + .setReverseFlag(!m_reverse) + .setLength(m_length) + .setShift(m_shift); } /** \brief Inverse of the Householder sequence (equals the adjoint). 
*/ - ConjugateReturnType inverse() const { return adjoint(); } + AdjointReturnType inverse() const { return adjoint(); } /** \internal */ - template inline void evalTo(DestType& dst) const + template + inline EIGEN_DEVICE_FUNC + void evalTo(DestType& dst) const { Matrix workspace(rows()); @@ -239,6 +286,7 @@ template class HouseholderS /** \internal */ template + EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -251,7 +299,7 @@ template class HouseholderS for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else @@ -265,18 +313,26 @@ template class HouseholderS for(Index k = 0; kBlockSize) + { + dst.setIdentity(rows(), rows()); + if(m_reverse) + applyThisOnTheLeft(dst,workspace,true); + else + applyThisOnTheLeft(dst,workspace,true); + } else { dst.setIdentity(rows(), rows()); for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data()); } } } @@ -295,42 +351,52 @@ template class HouseholderS workspace.resize(dst.rows()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? m_length-k-1 : k; + Index actual_k = m_reverse ? 
m_length-k-1 : k; dst.rightCols(rows()-m_shift-actual_k) .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } /** \internal */ - template inline void applyThisOnTheLeft(Dest& dst) const + template inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const { Matrix workspace; - applyThisOnTheLeft(dst, workspace); + applyThisOnTheLeft(dst, workspace, inputIsIdentity); } /** \internal */ template - inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const + inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const { - const Index BlockSize = 48; + if(inputIsIdentity && m_reverse) + inputIsIdentity = false; // if the entries are large enough, then apply the reflectors by block if(m_length>=BlockSize && dst.cols()>1) { - for(Index i = 0; i < m_length; i+=BlockSize) + // Make sure we have at least 2 useful blocks, otherwise it is point-less: + Index blockSize = m_length::type,Dynamic,Dynamic> SubVectorsType; SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side==OnTheRight ? k : start, Side==OnTheRight ? start : k, Side==OnTheRight ? bs : m_vectors.rows()-start, Side==OnTheRight ? m_vectors.cols()-start : bs); typename internal::conditional, SubVectorsType&>::type sub_vecs(sub_vecs1); - Block sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols()); - apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans); + + Index dstStart = dst.rows()-rows()+m_shift+k; + Index dstRows = rows()-m_shift-k; + Block sub_dst(dst, + dstStart, + inputIsIdentity ? dstStart : 0, + dstRows, + inputIsIdentity ? dstRows : dst.cols()); + apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); } } else @@ -338,8 +404,9 @@ template class HouseholderS workspace.resize(dst.cols()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? 
k : m_length-k-1; - dst.bottomRows(rows()-m_shift-actual_k) + Index actual_k = m_reverse ? k : m_length-k-1; + Index dstStart = rows()-m_shift-actual_k; + dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols()) .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } @@ -357,7 +424,7 @@ template class HouseholderS { typename internal::matrix_type_times_scalar_type::Type res(other.template cast::ResultScalar>()); - applyThisOnTheLeft(res); + applyThisOnTheLeft(res, internal::is_identity::value && res.rows()==res.cols()); return res; } @@ -372,6 +439,7 @@ template class HouseholderS * * \sa length() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -389,13 +457,17 @@ template class HouseholderS * * \sa shift() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } + EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ + + EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */ /* Necessary for .adjoint() and .conjugate() */ @@ -403,27 +475,30 @@ template class HouseholderS protected: - /** \brief Sets the transpose flag. - * \param [in] trans New value of the transpose flag. + /** \internal + * \brief Sets the reverse flag. + * \param [in] reverse New value of the reverse flag. * - * By default, the transpose flag is not set. If the transpose flag is set, then this object represents - * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * By default, the reverse flag is not set. If the reverse flag is set, then this object represents + * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$. 
* - * \sa trans() + * \sa reverseFlag(), transpose(), adjoint() */ - HouseholderSequence& setTrans(bool trans) + HouseholderSequence& setReverseFlag(bool reverse) { - m_trans = trans; + m_reverse = reverse; return *this; } - bool trans() const { return m_trans; } /**< \brief Returns the transpose flag. */ + bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. */ typename VectorsType::Nested m_vectors; typename CoeffsType::Nested m_coeffs; - bool m_trans; + bool m_reverse; Index m_length; Index m_shift; + enum { BlockSize = 48 }; }; /** \brief Computes the product of a matrix with a Householder sequence. @@ -444,7 +519,7 @@ typename internal::matrix_type_times_scalar_type @@ -454,7 +529,7 @@ HouseholderSequence householderSequence(const VectorsTyp } /** \ingroup Householder_Module \householder_module - * \brief Convenience function for constructing a Householder sequence. + * \brief Convenience function for constructing a Householder sequence. * \returns A HouseholderSequence constructed from the specified arguments. * \details This function differs from householderSequence() in that the template argument \p OnTheSide of * the constructed HouseholderSequence is set to OnTheRight, instead of the default OnTheLeft. 
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/inst/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index f66c846e..a117fc15 100644 --- a/inst/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/inst/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -10,7 +10,7 @@ #ifndef EIGEN_BASIC_PRECONDITIONERS_H #define EIGEN_BASIC_PRECONDITIONERS_H -namespace Eigen { +namespace Eigen { /** \ingroup IterativeLinearSolvers_Module * \brief A preconditioner based on the digonal entries @@ -52,15 +52,15 @@ class DiagonalPreconditioner compute(mat); } - Index rows() const { return m_invdiag.size(); } - Index cols() const { return m_invdiag.size(); } - + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); } + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); } + template DiagonalPreconditioner& analyzePattern(const MatType& ) { return *this; } - + template DiagonalPreconditioner& factorize(const MatType& mat) { @@ -77,7 +77,7 @@ class DiagonalPreconditioner m_isInitialized = true; return *this; } - + template DiagonalPreconditioner& compute(const MatType& mat) { @@ -99,7 +99,7 @@ class DiagonalPreconditioner && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b"); return Solve(*this, b.derived()); } - + ComputationInfo info() { return Success; } protected: @@ -121,7 +121,7 @@ class DiagonalPreconditioner * \implsparsesolverconcept * * The diagonal entries are pre-inverted and stored into a dense vector. 
- * + * * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner */ template @@ -146,7 +146,7 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> { return *this; } - + template LeastSquareDiagonalPreconditioner& factorize(const MatType& mat) { @@ -178,13 +178,13 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> Base::m_isInitialized = true; return *this; } - + template LeastSquareDiagonalPreconditioner& compute(const MatType& mat) { return factorize(mat); } - + ComputationInfo info() { return Success; } protected: @@ -205,19 +205,19 @@ class IdentityPreconditioner template explicit IdentityPreconditioner(const MatrixType& ) {} - + template IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; } - + template IdentityPreconditioner& factorize(const MatrixType& ) { return *this; } template IdentityPreconditioner& compute(const MatrixType& ) { return *this; } - + template inline const Rhs& solve(const Rhs& b) const { return b; } - + ComputationInfo info() { return Success; } }; diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 454f4681..153acef6 100644 --- a/inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -191,32 +191,16 @@ class BiCGSTAB : public IterativeSolverBase - void _solve_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const { - bool failed = false; - for(Index j=0; j - void _solve_impl(const MatrixBase& b, Dest& x) const - { - x.resize(this->rows(),b.cols()); - x.setZero(); - _solve_with_guess_impl(b,x); } protected: diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/inst/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index f7ce4713..5d8c6b43 100644 --- 
a/inst/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/inst/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -51,7 +51,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, return; } const RealScalar considerAsZero = (std::numeric_limits::min)(); - RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero); + RealScalar threshold = numext::maxi(RealScalar(tol*tol*rhsNorm2),considerAsZero); RealScalar residualNorm2 = residual.squaredNorm(); if (residualNorm2 < threshold) { @@ -195,7 +195,7 @@ class ConjugateGradient : public IterativeSolverBase - void _solve_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const { typedef typename Base::MatrixWrapper MatrixWrapper; typedef typename Base::ActualMatrixType ActualMatrixType; @@ -211,31 +211,14 @@ class ConjugateGradient : public IterativeSolverBase::Type >::type SelfAdjointWrapper; + m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; - for(Index j=0; j - void _solve_impl(const MatrixBase& b, Dest& x) const - { - x.setZero(); - _solve_with_guess_impl(b.derived(),x); - } protected: diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index e45c272b..7803fd81 100644 --- a/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -14,8 +14,8 @@ #include #include -namespace Eigen { -/** +namespace Eigen { +/** * \brief Modified Incomplete Cholesky with dual threshold * * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with @@ -41,28 +41,22 @@ namespace Eigen { * the info() method, then you can either increase the initial shift, or better use another preconditioning technique. 
* */ -template -#else -NaturalOrdering -#endif -> +template > class IncompleteCholesky : public SparseSolverBase > { protected: typedef SparseSolverBase > Base; using Base::m_isInitialized; public: - typedef typename NumTraits::Real RealScalar; + typedef typename NumTraits::Real RealScalar; typedef _OrderingType OrderingType; typedef typename OrderingType::PermutationType PermutationType; - typedef typename PermutationType::StorageIndex StorageIndex; + typedef typename PermutationType::StorageIndex StorageIndex; typedef SparseMatrix FactorType; typedef Matrix VectorSx; typedef Matrix VectorRx; typedef Matrix VectorIx; - typedef std::vector > VectorList; + typedef std::vector > VectorList; enum { UpLo = _UpLo }; enum { ColsAtCompileTime = Dynamic, @@ -76,22 +70,22 @@ class IncompleteCholesky : public SparseSolverBase - IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_factorizationIsOk(false) + IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) { compute(matrix); } - + /** \returns number of rows of the factored matrix */ - Index rows() const { return m_L.rows(); } - + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); } + /** \returns number of columns of the factored matrix */ - Index cols() const { return m_L.cols(); } - + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); } + /** \brief Reports whether previous computation was successful. 
* @@ -106,19 +100,19 @@ class IncompleteCholesky : public SparseSolverBase void analyzePattern(const MatrixType& mat) { - OrderingType ord; + OrderingType ord; PermutationType pinv; - ord(mat.template selfadjointView(), pinv); + ord(mat.template selfadjointView(), pinv); if(pinv.size()>0) m_perm = pinv.inverse(); else m_perm.resize(0); m_L.resize(mat.rows(), mat.cols()); @@ -126,7 +120,7 @@ class IncompleteCholesky : public SparseSolverBase void factorize(const MatrixType& mat); - + /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat * * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods. @@ -149,7 +143,7 @@ class IncompleteCholesky : public SparseSolverBase void _solve_impl(const Rhs& b, Dest& x) const @@ -176,16 +170,16 @@ class IncompleteCholesky : public SparseSolverBase colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); -}; + inline void updateList(Ref colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); +}; // Based on the following paper: // C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with @@ -196,10 +190,10 @@ template void IncompleteCholesky::factorize(const _MatrixType& mat) { using std::sqrt; - eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); - + eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); + // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. 
Other strategies will be added - + // Apply the fill-reducing permutation computed in analyzePattern() if (m_perm.rows() == mat.rows() ) // To detect the null permutation { @@ -212,8 +206,8 @@ void IncompleteCholesky::factorize(const _MatrixType { m_L.template selfadjointView() = mat.template selfadjointView<_UpLo>(); } - - Index n = m_L.cols(); + + Index n = m_L.cols(); Index nnz = m_L.nonZeros(); Map vals(m_L.valuePtr(), nnz); //values Map rowIdx(m_L.innerIndexPtr(), nnz); //Row indices @@ -225,9 +219,9 @@ void IncompleteCholesky::factorize(const _MatrixType VectorIx col_pattern(n); col_pattern.fill(-1); StorageIndex col_nnz; - - - // Computes the scaling factors + + + // Computes the scaling factors m_scale.resize(n); m_scale.setZero(); for (Index j = 0; j < n; j++) @@ -237,7 +231,7 @@ void IncompleteCholesky::factorize(const _MatrixType if(rowIdx[k]!=j) m_scale(rowIdx[k]) += numext::abs2(vals(k)); } - + m_scale = m_scale.cwiseSqrt().cwiseSqrt(); for (Index j = 0; j < n; ++j) @@ -247,8 +241,8 @@ void IncompleteCholesky::factorize(const _MatrixType m_scale(j) = 1; // TODO disable scaling if not needed, i.e., if it is roughly uniform? 
(this will make solve() faster) - - // Scale and compute the shift for the matrix + + // Scale and compute the shift for the matrix RealScalar mindiag = NumTraits::highest(); for (Index j = 0; j < n; j++) { @@ -259,7 +253,7 @@ void IncompleteCholesky::factorize(const _MatrixType } FactorType L_save = m_L; - + RealScalar shift = 0; if(mindiag <= RealScalar(0.)) shift = m_initialShift - mindiag; @@ -381,7 +375,7 @@ inline void IncompleteCholesky::updateList(Ref::updateList(Ref= abs(row(ncut)) if incut + * abs(row(i)) <= abs(row(ncut)) if i>ncut * \param row The vector of values * \param ind The array of index for the elements in @p row * \param ncut The number of largest elements to keep - **/ + **/ template Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) { @@ -34,15 +34,15 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) Index mid; Index n = row.size(); /* length of the vector */ Index first, last ; - + ncut--; /* to fit the zero-based indices */ - first = 0; - last = n-1; + first = 0; + last = n-1; if (ncut < first || ncut > last ) return 0; - + do { - mid = first; - RealScalar abskey = abs(row(mid)); + mid = first; + RealScalar abskey = abs(row(mid)); for (Index j = first + 1; j <= last; j++) { if ( abs(row(j)) > abskey) { ++mid; @@ -53,12 +53,12 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) /* Interchange for the pivot element */ swap(row(mid), row(first)); swap(ind(mid), ind(first)); - + if (mid > ncut) last = mid - 1; - else if (mid < ncut ) first = mid + 1; + else if (mid < ncut ) first = mid + 1; } while (mid != ncut ); - - return 0; /* mid is equal to ncut */ + + return 0; /* mid is equal to ncut */ } }// end namespace internal @@ -71,23 +71,23 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) * * During the numerical factorization, two dropping rules are used : * 1) any element whose magnitude is less than some tolerance is dropped. 
- * This tolerance is obtained by multiplying the input tolerance @p droptol + * This tolerance is obtained by multiplying the input tolerance @p droptol * by the average magnitude of all the original elements in the current row. - * 2) After the elimination of the row, only the @p fill largest elements in - * the L part and the @p fill largest elements in the U part are kept - * (in addition to the diagonal element ). Note that @p fill is computed from - * the input parameter @p fillfactor which is used the ratio to control the fill_in + * 2) After the elimination of the row, only the @p fill largest elements in + * the L part and the @p fill largest elements in the U part are kept + * (in addition to the diagonal element ). Note that @p fill is computed from + * the input parameter @p fillfactor which is used the ratio to control the fill_in * relatively to the initial number of nonzero elements. - * + * * The two extreme cases are when @p droptol=0 (to keep all the @p fill*2 largest elements) - * and when @p fill=n/2 with @p droptol being different to zero. - * - * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, + * and when @p fill=n/2 with @p droptol being different to zero. + * + * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, * Numerical Linear Algebra with Applications, 1(4), pp 387-402, 1994. - * + * * NOTE : The following implementation is derived from the ILUT implementation - * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota - * released under the terms of the GNU LGPL: + * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota + * released under the terms of the GNU LGPL: * http://www-users.cs.umn.edu/~saad/software/SPARSKIT/README * However, Yousef Saad gave us permission to relicense his ILUT code to MPL2. 
* See the Eigen mailing list archive, thread: ILUT, date: July 8, 2012: @@ -115,28 +115,28 @@ class IncompleteLUT : public SparseSolverBase::dummy_precision()), m_fillfactor(10), m_analysisIsOk(false), m_factorizationIsOk(false) {} - + template explicit IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits::dummy_precision(), int fillfactor = 10) : m_droptol(droptol),m_fillfactor(fillfactor), m_analysisIsOk(false),m_factorizationIsOk(false) { eigen_assert(fillfactor != 0); - compute(mat); + compute(mat); } - - Index rows() const { return m_lu.rows(); } - - Index cols() const { return m_lu.cols(); } + + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. 
*/ ComputationInfo info() const @@ -144,36 +144,36 @@ class IncompleteLUT : public SparseSolverBase void analyzePattern(const MatrixType& amat); - + template void factorize(const MatrixType& amat); - + /** * Compute an incomplete LU factorization with dual threshold on the matrix mat * No pivoting is done in this version - * + * **/ template IncompleteLUT& compute(const MatrixType& amat) { - analyzePattern(amat); + analyzePattern(amat); factorize(amat); return *this; } - void setDroptol(const RealScalar& droptol); - void setFillfactor(int fillfactor); - + void setDroptol(const RealScalar& droptol); + void setFillfactor(int fillfactor); + template void _solve_impl(const Rhs& b, Dest& x) const { x = m_Pinv * b; x = m_lu.template triangularView().solve(x); x = m_lu.template triangularView().solve(x); - x = m_P * x; + x = m_P * x; } protected: @@ -200,22 +200,22 @@ class IncompleteLUT : public SparseSolverBase void IncompleteLUT::setDroptol(const RealScalar& droptol) { - this->m_droptol = droptol; + this->m_droptol = droptol; } /** * Set control parameter fillfactor - * \param fillfactor This is used to compute the number @p fill_in of largest elements to keep on each row. - **/ + * \param fillfactor This is used to compute the number @p fill_in of largest elements to keep on each row. + **/ template void IncompleteLUT::setFillfactor(int fillfactor) { - this->m_fillfactor = fillfactor; + this->m_fillfactor = fillfactor; } template @@ -225,24 +225,15 @@ void IncompleteLUT::analyzePattern(const _MatrixType& amat) // Compute the Fill-reducing permutation // Since ILUT does not perform any numerical pivoting, // it is highly preferable to keep the diagonal through symmetric permutations. -#ifndef EIGEN_MPL2_ONLY // To this end, let's symmetrize the pattern and perform AMD on it. SparseMatrix mat1 = amat; SparseMatrix mat2 = amat.transpose(); // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice. 
- // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered... + // on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred... SparseMatrix AtA = mat2 + mat1; AMDOrdering ordering; ordering(AtA,m_P); m_Pinv = m_P.inverse(); // cache the inverse permutation -#else - // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine. - SparseMatrix mat1 = amat; - COLAMDOrdering ordering; - ordering(mat1,m_Pinv); - m_P = m_Pinv.inverse(); -#endif - m_analysisIsOk = true; m_factorizationIsOk = false; m_isInitialized = true; diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/inst/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 7c2326eb..28a0c510 100644 --- a/inst/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/inst/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -10,7 +10,7 @@ #ifndef EIGEN_ITERATIVE_SOLVER_BASE_H #define EIGEN_ITERATIVE_SOLVER_BASE_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -145,7 +145,7 @@ class IterativeSolverBase : public SparseSolverBase protected: typedef SparseSolverBase Base; using Base::m_isInitialized; - + public: typedef typename internal::traits::MatrixType MatrixType; typedef typename internal::traits::Preconditioner Preconditioner; @@ -169,10 +169,10 @@ class IterativeSolverBase : public SparseSolverBase } /** Initialize the solver with matrix \a A for further \c Ax=b solving. - * + * * This constructor is a shortcut for the default constructor followed * by a call to compute(). - * + * * \warning this class stores a reference to the matrix A as well as some * precomputed values that depend on it. Therefore, if \a A is changed * this class becomes invalid. 
Call compute() to update it with the new @@ -187,7 +187,7 @@ class IterativeSolverBase : public SparseSolverBase } ~IterativeSolverBase() {} - + /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems. * * Currently, this function mostly calls analyzePattern on the preconditioner. In the future @@ -203,7 +203,7 @@ class IterativeSolverBase : public SparseSolverBase m_info = m_preconditioner.info(); return derived(); } - + /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems. * * Currently, this function mostly calls factorize on the preconditioner. @@ -216,7 +216,7 @@ class IterativeSolverBase : public SparseSolverBase template Derived& factorize(const EigenBase& A) { - eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); + eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); grab(A.derived()); m_preconditioner.factorize(matrix()); m_factorizationIsOk = true; @@ -247,16 +247,16 @@ class IterativeSolverBase : public SparseSolverBase } /** \internal */ - Index rows() const { return matrix().rows(); } + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); } /** \internal */ - Index cols() const { return matrix().cols(); } + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); } /** \returns the tolerance threshold used by the stopping criteria. * \sa setTolerance() */ RealScalar tolerance() const { return m_tolerance; } - + /** Sets the tolerance threshold used by the stopping criteria. * * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|. @@ -270,19 +270,19 @@ class IterativeSolverBase : public SparseSolverBase /** \returns a read-write reference to the preconditioner for custom configuration. */ Preconditioner& preconditioner() { return m_preconditioner; } - + /** \returns a read-only reference to the preconditioner. 
*/ const Preconditioner& preconditioner() const { return m_preconditioner; } /** \returns the max number of iterations. - * It is either the value setted by setMaxIterations or, by default, + * It is either the value set by setMaxIterations or, by default, * twice the number of columns of the matrix. */ Index maxIterations() const { return (m_maxIterations<0) ? 2*matrix().cols() : m_maxIterations; } - + /** Sets the max number of iterations. * Default is twice the number of columns of the matrix. */ @@ -328,13 +328,13 @@ class IterativeSolverBase : public SparseSolverBase eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized."); return m_info; } - + /** \internal */ template - void _solve_impl(const Rhs& b, SparseMatrixBase &aDest) const + void _solve_with_guess_impl(const Rhs& b, SparseMatrixBase &aDest) const { eigen_assert(rows()==b.rows()); - + Index rhsCols = b.cols(); Index size = b.rows(); DestDerived& dest(aDest.derived()); @@ -344,15 +344,65 @@ class IterativeSolverBase : public SparseSolverBase // We do not directly fill dest because sparse expressions have to be free of aliasing issue. // For non square least-square problems, b and dest might not have the same size whereas they might alias each-other. 
typename DestDerived::PlainObject tmp(cols(),rhsCols); + ComputationInfo global_info = Success; for(Index k=0; k + typename internal::enable_if::type + _solve_with_guess_impl(const Rhs& b, MatrixBase &aDest) const + { + eigen_assert(rows()==b.rows()); + + Index rhsCols = b.cols(); + DestDerived& dest(aDest.derived()); + ComputationInfo global_info = Success; + for(Index k=0; k + typename internal::enable_if::type + _solve_with_guess_impl(const Rhs& b, MatrixBase &dest) const + { + derived()._solve_vector_with_guess_impl(b,dest.derived()); + } + + /** \internal default initial guess = 0 */ + template + void _solve_impl(const Rhs& b, Dest& x) const + { + x.setZero(); + derived()._solve_with_guess_impl(b,x); + } + protected: void init() { @@ -370,19 +420,19 @@ class IterativeSolverBase : public SparseSolverBase { return m_matrixWrapper.matrix(); } - + template void grab(const InputType &A) { m_matrixWrapper.grab(A); } - + MatrixWrapper m_matrixWrapper; Preconditioner m_preconditioner; Index m_maxIterations; RealScalar m_tolerance; - + mutable RealScalar m_error; mutable Index m_iterations; mutable ComputationInfo m_info; diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/inst/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h index 0aea0e09..203fd0ec 100644 --- a/inst/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +++ b/inst/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -182,32 +182,14 @@ class LeastSquaresConjugateGradient : public IterativeSolverBase - void _solve_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const { m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; - for(Index j=0; j - void _solve_impl(const MatrixBase& b, Dest& x) const - { - x.setZero(); - _solve_with_guess_impl(b.derived(),x); - } }; diff --git 
a/inst/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/inst/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h index 0ace4517..7b896575 100644 --- a/inst/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +++ b/inst/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h @@ -13,7 +13,7 @@ namespace Eigen { template class SolveWithGuess; - + /** \class SolveWithGuess * \ingroup IterativeLinearSolvers_Module * @@ -45,13 +45,15 @@ class SolveWithGuess : public internal::generic_xpr_base::PlainObject PlainObject; typedef typename internal::generic_xpr_base, MatrixXpr, typename internal::traits::StorageKind>::type Base; typedef typename internal::ref_selector::type Nested; - + SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess) : m_dec(dec), m_rhs(rhs), m_guess(guess) {} - - EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; } EIGEN_DEVICE_FUNC const RhsType& rhs() const { return m_rhs; } @@ -61,7 +63,7 @@ class SolveWithGuess : public internal::generic_xpr_base > m_result = solve.guess(); solve.dec()._solve_with_guess_impl(solve.rhs(), m_result); } - -protected: + +protected: PlainObject m_result; }; @@ -108,7 +110,7 @@ struct Assignment, interna } }; -} // end namepsace internal +} // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/Jacobi/Jacobi.h b/inst/include/Eigen/src/Jacobi/Jacobi.h index 1998c632..76668a57 100644 --- a/inst/include/Eigen/src/Jacobi/Jacobi.h +++ b/inst/include/Eigen/src/Jacobi/Jacobi.h @@ -11,7 +11,7 @@ #ifndef EIGEN_JACOBI_H #define EIGEN_JACOBI_H -namespace Eigen { +namespace Eigen { /** \ingroup Jacobi_Module * 
\jacobi_module @@ -37,17 +37,20 @@ template class JacobiRotation typedef typename NumTraits::Real RealScalar; /** Default constructor without any initialization. */ + EIGEN_DEVICE_FUNC JacobiRotation() {} /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */ + EIGEN_DEVICE_FUNC JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {} - Scalar& c() { return m_c; } - Scalar c() const { return m_c; } - Scalar& s() { return m_s; } - Scalar s() const { return m_s; } + EIGEN_DEVICE_FUNC Scalar& c() { return m_c; } + EIGEN_DEVICE_FUNC Scalar c() const { return m_c; } + EIGEN_DEVICE_FUNC Scalar& s() { return m_s; } + EIGEN_DEVICE_FUNC Scalar s() const { return m_s; } /** Concatenates two planar rotation */ + EIGEN_DEVICE_FUNC JacobiRotation operator*(const JacobiRotation& other) { using numext::conj; @@ -56,19 +59,26 @@ template class JacobiRotation } /** Returns the transposed transformation */ + EIGEN_DEVICE_FUNC JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); } /** Returns the adjoint transformation */ + EIGEN_DEVICE_FUNC JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); } template + EIGEN_DEVICE_FUNC bool makeJacobi(const MatrixBase&, Index p, Index q); + EIGEN_DEVICE_FUNC bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0); protected: + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type); + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type); Scalar m_c, m_s; @@ -80,10 +90,12 @@ template class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) 
{ using std::sqrt; using std::abs; + RealScalar deno = RealScalar(2)*abs(y); if(deno < (std::numeric_limits::min)()) { @@ -123,6 +135,7 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co */ template template +EIGEN_DEVICE_FUNC inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -145,6 +158,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) { makeGivens(p, q, r, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); @@ -153,12 +167,13 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for complexes template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; using std::abs; using numext::conj; - + if(q==Scalar(0)) { m_c = numext::real(p)<0 ? 
Scalar(-1) : Scalar(1); @@ -212,6 +227,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -257,12 +273,13 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar namespace internal { /** \jacobi_module - * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y: + * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y: * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right ) = J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$ * * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j); } @@ -274,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRotation& j) { RowXpr x(this->row(p)); @@ -289,6 +307,7 @@ inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRo */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiRotation& j) { ColXpr x(this->col(p)); @@ -302,7 +321,8 @@ template struct apply_rotation_in_the_plane_selector { - static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + static EIGEN_DEVICE_FUNC + inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) { for(Index i=0; i +EIGEN_DEVICE_FUNC void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; - const bool Vectorizable = (VectorX::Flags & VectorY::Flags & 
PacketAccessBit) + const bool Vectorizable = (int(VectorX::Flags) & int(VectorY::Flags) & PacketAccessBit) && (int(packet_traits::size) == int(packet_traits::size)); eigen_assert(xpr_x.size() == xpr_y.size()); @@ -442,7 +463,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0); Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0); - + OtherScalar c = j.c(); OtherScalar s = j.s(); if (c==OtherScalar(1) && s==OtherScalar(0)) diff --git a/inst/include/Eigen/src/KLUSupport/KLUSupport.h b/inst/include/Eigen/src/KLUSupport/KLUSupport.h new file mode 100644 index 00000000..215db35b --- /dev/null +++ b/inst/include/Eigen/src/KLUSupport/KLUSupport.h @@ -0,0 +1,358 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Kyle Macfarlan +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_H +#define EIGEN_KLUSUPPORT_H + +namespace Eigen { + +/* TODO extract L, extract U, compute det, etc... */ + +/** \ingroup KLUSupport_Module + * \brief A sparse LU factorization and solver based on KLU + * + * This class solves sparse linear problems of the form A.X = B via an LU factorization + * using the KLU library. The sparse matrix A must be square and full rank. + * The vectors or matrices X and B can be either dense or sparse. + * + * \warning The input matrix A should be in a \b compressed and \b column-major form. + * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+ * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * + * \implsparsesolverconcept + * + * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU + */ + + +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) { + return klu_solve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), B, Common); +} + +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_solve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), &numext::real_ref(B[0]), Common); +} + +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) { + return klu_tsolve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), B, Common); +} + +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_tsolve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), &numext::real_ref(B[0]), 0, Common); +} + +inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) { + return klu_factor(Ap, Ai, Ax, Symbolic, Common); +} + +inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex) { + return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common); +} + + +template +class KLU : public SparseSolverBase > +{ + protected: + typedef SparseSolverBase > Base; + using Base::m_isInitialized; + public: + using Base::_solve_impl; + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef 
typename MatrixType::StorageIndex StorageIndex; + typedef Matrix Vector; + typedef Matrix IntRowVectorType; + typedef Matrix IntColVectorType; + typedef SparseMatrix LUMatrixType; + typedef SparseMatrix KLUMatrixType; + typedef Ref KLUMatrixRef; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + + public: + + KLU() + : m_dummy(0,0), mp_matrix(m_dummy) + { + init(); + } + + template + explicit KLU(const InputMatrixType& matrix) + : mp_matrix(matrix) + { + init(); + compute(matrix); + } + + ~KLU() + { + if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common); + if(m_numeric) klu_free_numeric(&m_numeric,&m_common); + } + + EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); } + EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); } + + /** \brief Reports whether previous computation was successful. + * + * \returns \c Success if computation was successful, + * \c NumericalIssue if the matrix.appears to be negative. + */ + ComputationInfo info() const + { + eigen_assert(m_isInitialized && "Decomposition is not initialized."); + return m_info; + } +#if 0 // not implemented yet + inline const LUMatrixType& matrixL() const + { + if (m_extractedDataAreDirty) extractData(); + return m_l; + } + + inline const LUMatrixType& matrixU() const + { + if (m_extractedDataAreDirty) extractData(); + return m_u; + } + + inline const IntColVectorType& permutationP() const + { + if (m_extractedDataAreDirty) extractData(); + return m_p; + } + + inline const IntRowVectorType& permutationQ() const + { + if (m_extractedDataAreDirty) extractData(); + return m_q; + } +#endif + /** Computes the sparse Cholesky decomposition of \a matrix + * Note that the matrix should be column-major, and in compressed format for best performance. + * \sa SparseMatrix::makeCompressed(). 
+ */ + template + void compute(const InputMatrixType& matrix) + { + if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); + if(m_numeric) klu_free_numeric(&m_numeric, &m_common); + grab(matrix.derived()); + analyzePattern_impl(); + factorize_impl(); + } + + /** Performs a symbolic decomposition on the sparcity of \a matrix. + * + * This function is particularly useful when solving for several problems having the same structure. + * + * \sa factorize(), compute() + */ + template + void analyzePattern(const InputMatrixType& matrix) + { + if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); + if(m_numeric) klu_free_numeric(&m_numeric, &m_common); + + grab(matrix.derived()); + + analyzePattern_impl(); + } + + + /** Provides access to the control settings array used by KLU. + * + * See KLU documentation for details. + */ + inline const klu_common& kluCommon() const + { + return m_common; + } + + /** Provides access to the control settings array used by UmfPack. + * + * If this array contains NaN's, the default values are used. + * + * See KLU documentation for details. + */ + inline klu_common& kluCommon() + { + return m_common; + } + + /** Performs a numeric decomposition of \a matrix + * + * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed. 
+ * + * \sa analyzePattern(), compute() + */ + template + void factorize(const InputMatrixType& matrix) + { + eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()"); + if(m_numeric) + klu_free_numeric(&m_numeric,&m_common); + + grab(matrix.derived()); + + factorize_impl(); + } + + /** \internal */ + template + bool _solve_impl(const MatrixBase &b, MatrixBase &x) const; + +#if 0 // not implemented yet + Scalar determinant() const; + + void extractData() const; +#endif + + protected: + + void init() + { + m_info = InvalidInput; + m_isInitialized = false; + m_numeric = 0; + m_symbolic = 0; + m_extractedDataAreDirty = true; + + klu_defaults(&m_common); + } + + void analyzePattern_impl() + { + m_info = InvalidInput; + m_analysisIsOk = false; + m_factorizationIsOk = false; + m_symbolic = klu_analyze(internal::convert_index(mp_matrix.rows()), + const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), + &m_common); + if (m_symbolic) { + m_isInitialized = true; + m_info = Success; + m_analysisIsOk = true; + m_extractedDataAreDirty = true; + } + } + + void factorize_impl() + { + + m_numeric = klu_factor(const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), const_cast(mp_matrix.valuePtr()), + m_symbolic, &m_common, Scalar()); + + + m_info = m_numeric ? Success : NumericalIssue; + m_factorizationIsOk = m_numeric ? 1 : 0; + m_extractedDataAreDirty = true; + } + + template + void grab(const EigenBase &A) + { + mp_matrix.~KLUMatrixRef(); + ::new (&mp_matrix) KLUMatrixRef(A.derived()); + } + + void grab(const KLUMatrixRef &A) + { + if(&(A.derived()) != &mp_matrix) + { + mp_matrix.~KLUMatrixRef(); + ::new (&mp_matrix) KLUMatrixRef(A); + } + } + + // cached data to reduce reallocation, etc. 
+#if 0 // not implemented yet + mutable LUMatrixType m_l; + mutable LUMatrixType m_u; + mutable IntColVectorType m_p; + mutable IntRowVectorType m_q; +#endif + + KLUMatrixType m_dummy; + KLUMatrixRef mp_matrix; + + klu_numeric* m_numeric; + klu_symbolic* m_symbolic; + klu_common m_common; + mutable ComputationInfo m_info; + int m_factorizationIsOk; + int m_analysisIsOk; + mutable bool m_extractedDataAreDirty; + + private: + KLU(const KLU& ) { } +}; + +#if 0 // not implemented yet +template +void KLU::extractData() const +{ + if (m_extractedDataAreDirty) + { + eigen_assert(false && "KLU: extractData Not Yet Implemented"); + + // get size of the data + int lnz, unz, rows, cols, nz_udiag; + umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); + + // allocate data + m_l.resize(rows,(std::min)(rows,cols)); + m_l.resizeNonZeros(lnz); + + m_u.resize((std::min)(rows,cols),cols); + m_u.resizeNonZeros(unz); + + m_p.resize(rows); + m_q.resize(cols); + + // extract + umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(), + m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(), + m_p.data(), m_q.data(), 0, 0, 0, m_numeric); + + m_extractedDataAreDirty = false; + } +} + +template +typename KLU::Scalar KLU::determinant() const +{ + eigen_assert(false && "KLU: extractData Not Yet Implemented"); + return Scalar(); +} +#endif + +template +template +bool KLU::_solve_impl(const MatrixBase &b, MatrixBase &x) const +{ + Index rhsCols = b.cols(); + EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()"); + + x = b; + int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); + + m_info = info!=0 ? 
Success : NumericalIssue; + return true; +} + +} // end namespace Eigen + +#endif // EIGEN_KLUSUPPORT_H diff --git a/inst/include/Eigen/src/LU/Determinant.h b/inst/include/Eigen/src/LU/Determinant.h index d6a3c1e5..3a41e6fc 100644 --- a/inst/include/Eigen/src/LU/Determinant.h +++ b/inst/include/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase& matrix, int a, int b, int c) { @@ -22,14 +23,6 @@ inline const typename Derived::Scalar bruteforce_det3_helper * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b)); } -template -const typename Derived::Scalar bruteforce_det4_helper -(const MatrixBase& matrix, int j, int k, int m, int n) -{ - return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1)) - * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3)); -} - template struct determinant_impl @@ -44,7 +37,8 @@ template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 +46,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +55,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,15 +66,34 @@ template struct determinant_impl template struct determinant_impl { - static typename traits::Scalar run(const Derived& m) + typedef 
typename traits::Scalar Scalar; + static EIGEN_DEVICE_FUNC + Scalar run(const Derived& m) + { + Scalar d2_01 = det2(m, 0, 1); + Scalar d2_02 = det2(m, 0, 2); + Scalar d2_03 = det2(m, 0, 3); + Scalar d2_12 = det2(m, 1, 2); + Scalar d2_13 = det2(m, 1, 3); + Scalar d2_23 = det2(m, 2, 3); + Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12); + Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02); + Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01); + Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01); + return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) + + internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3); + } +protected: + static EIGEN_DEVICE_FUNC + Scalar det2(const Derived& m, Index i0, Index i1) + { + return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1); + } + + static EIGEN_DEVICE_FUNC + Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2) { - // trick by Martin Costabel to compute 4x4 det with only 30 muls - return bruteforce_det4_helper(m,0,1,2,3) - - bruteforce_det4_helper(m,0,2,1,3) - + bruteforce_det4_helper(m,0,3,1,2) - + bruteforce_det4_helper(m,1,2,0,3) - - bruteforce_det4_helper(m,1,3,0,2) - + bruteforce_det4_helper(m,2,3,0,1); + return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2)); } }; @@ -89,6 +104,7 @@ template struct determinant_impl * \returns the determinant of this matrix */ template +EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar MatrixBase::determinant() const { eigen_assert(rows() == cols()); diff --git a/inst/include/Eigen/src/LU/FullPivLU.h b/inst/include/Eigen/src/LU/FullPivLU.h index 03b6af70..ba1749fa 100644 --- a/inst/include/Eigen/src/LU/FullPivLU.h +++ b/inst/include/Eigen/src/LU/FullPivLU.h @@ -18,6 +18,7 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -48,12 +49,12 @@ template struct traits > * The data of the LU decomposition can be directly accessed through the 
methods matrixLU(), * permutationP(), permutationQ(). * - * As an exemple, here is how the original matrix can be retrieved: + * As an example, here is how the original matrix can be retrieved: * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ template class FullPivLU @@ -62,9 +63,9 @@ template class FullPivLU public: typedef _MatrixType MatrixType; typedef SolverBase Base; + friend class SolverBase; EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -218,6 +219,7 @@ template class FullPivLU return internal::image_retval(*this, originalMatrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \return a solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -237,14 +239,10 @@ template class FullPivLU * * \sa TriangularView::solve(), kernel(), inverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LU is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -320,7 +318,7 @@ template class FullPivLU return m_usePrescribedThreshold ? m_prescribedThreshold // this formula comes from experimenting (see "LU precision tuning" thread on the list) // and turns out to be identical to Higham's formula used already in LDLt. 
- : NumTraits::epsilon() * m_lu.diagonalSize(); + : NumTraits::epsilon() * RealScalar(m_lu.diagonalSize()); } /** \returns the rank of the matrix of which *this is the LU decomposition. @@ -406,16 +404,16 @@ template class FullPivLU MatrixType reconstructedMatrix() const; - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template - EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif @@ -531,8 +529,8 @@ void FullPivLU::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; ++i) { - m_rowsTranspositions.coeffRef(i) = i; - m_colsTranspositions.coeffRef(i) = i; + m_rowsTranspositions.coeffRef(i) = internal::convert_index(i); + m_colsTranspositions.coeffRef(i) = internal::convert_index(i); } break; } @@ -543,8 +541,8 @@ void FullPivLU::computeInPlace() // Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). 
- m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner; - m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner; + m_rowsTranspositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); + m_colsTranspositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner)); ++number_of_transpositions; @@ -757,7 +755,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == rows); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -807,7 +804,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == cols); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -821,29 +817,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType // Step 1 c = permutationQ().inverse() * rhs; - if (Conjugate) { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .adjoint() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .adjoint() - .solveInPlace(c.topRows(smalldim)); - } else { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .transpose() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .transpose() - .solveInPlace(c.topRows(smalldim)); - } + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose() + .template conjugateIf() + .solveInPlace(c.topRows(nonzero_pivots)); + + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + 
.template triangularView() + .transpose() + .template conjugateIf() + .solveInPlace(c.topRows(smalldim)); // Step 4 PermutationPType invp = permutationP().inverse().eval(); diff --git a/inst/include/Eigen/src/LU/InverseImpl.h b/inst/include/Eigen/src/LU/InverseImpl.h index f49f2336..a40cefa9 100644 --- a/inst/include/Eigen/src/LU/InverseImpl.h +++ b/inst/include/Eigen/src/LU/InverseImpl.h @@ -77,10 +77,11 @@ inline void compute_inverse_size2_helper( const MatrixType& matrix, const typename ResultType::Scalar& invdet, ResultType& result) { + typename ResultType::Scalar temp = matrix.coeff(0,0); result.coeffRef(0,0) = matrix.coeff(1,1) * invdet; result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet; result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet; - result.coeffRef(1,1) = matrix.coeff(0,0) * invdet; + result.coeffRef(1,1) = temp * invdet; } template @@ -143,13 +144,18 @@ inline void compute_inverse_size3_helper( const Matrix& cofactors_col0, ResultType& result) { - result.row(0) = cofactors_col0 * invdet; - result.coeffRef(1,0) = cofactor_3x3(matrix) * invdet; - result.coeffRef(1,1) = cofactor_3x3(matrix) * invdet; + // Compute cofactors in a way that avoids aliasing issues. 
+ typedef typename ResultType::Scalar Scalar; + const Scalar c01 = cofactor_3x3(matrix) * invdet; + const Scalar c11 = cofactor_3x3(matrix) * invdet; + const Scalar c02 = cofactor_3x3(matrix) * invdet; result.coeffRef(1,2) = cofactor_3x3(matrix) * invdet; - result.coeffRef(2,0) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,1) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,2) = cofactor_3x3(matrix) * invdet; + result.coeffRef(1,0) = c01; + result.coeffRef(1,1) = c11; + result.coeffRef(2,0) = c02; + result.row(0) = cofactors_col0 * invdet; } template @@ -181,14 +187,13 @@ struct compute_inverse_and_det_with_check bool& invertible ) { - using std::abs; typedef typename ResultType::Scalar Scalar; Matrix cofactors_col0; cofactors_col0.coeffRef(0) = cofactor_3x3(matrix); cofactors_col0.coeffRef(1) = cofactor_3x3(matrix); cofactors_col0.coeffRef(2) = cofactor_3x3(matrix); determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); - invertible = abs(determinant) > absDeterminantThreshold; + invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold; if(!invertible) return; const Scalar invdet = Scalar(1) / determinant; compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); @@ -273,7 +278,13 @@ struct compute_inverse_and_det_with_check using std::abs; determinant = matrix.determinant(); invertible = abs(determinant) > absDeterminantThreshold; - if(invertible) compute_inverse::run(matrix, inverse); + if(invertible && extract_data(matrix) != extract_data(inverse)) { + compute_inverse::run(matrix, inverse); + } + else if(invertible) { + MatrixType matrix_t = matrix; + compute_inverse::run(matrix_t, inverse); + } } }; @@ -290,6 +301,7 @@ template struct Assignment, internal::assign_op, Dense2Dense> { typedef Inverse SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -332,6 +344,7 @@ struct Assignment, internal::assign_op 
+EIGEN_DEVICE_FUNC inline const Inverse MatrixBase::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) @@ -345,6 +358,8 @@ inline const Inverse MatrixBase::inverse() const * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param determinant Reference to the variable in which to store the determinant. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. @@ -385,6 +400,8 @@ inline void MatrixBase::computeInverseAndDetWithCheck( * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. diff --git a/inst/include/Eigen/src/LU/PartialPivLU.h b/inst/include/Eigen/src/LU/PartialPivLU.h index 6b10f39f..34aed724 100644 --- a/inst/include/Eigen/src/LU/PartialPivLU.h +++ b/inst/include/Eigen/src/LU/PartialPivLU.h @@ -19,6 +19,7 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; typedef traits<_MatrixType> BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, @@ -69,7 +70,7 @@ struct enable_if_ref,Derived> { * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP(). * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. 
- * + * * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU */ template class PartialPivLU @@ -79,8 +80,9 @@ template class PartialPivLU typedef _MatrixType MatrixType; typedef SolverBase Base; + friend class SolverBase; + EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -152,6 +154,7 @@ template class PartialPivLU return m_p; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method returns the solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -169,14 +172,10 @@ template class PartialPivLU * * \sa TriangularView::solve(), inverse(), computeInverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -217,8 +216,8 @@ template class PartialPivLU MatrixType reconstructedMatrix() const; - inline Index rows() const { return m_lu.rows(); } - inline Index cols() const { return m_lu.cols(); } + EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template @@ -231,8 +230,6 @@ template class PartialPivLU * Step 3: replace c by the solution x to Ux = c. 
*/ - eigen_assert(rhs.rows() == m_lu.rows()); - // Step 1 dst = permutationP() * rhs; @@ -246,26 +243,21 @@ template class PartialPivLU template EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const { - /* The decomposition PA = LU can be rewritten as A = P^{-1} L U. + /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P. * So we proceed as follows: - * Step 1: compute c = Pb. - * Step 2: replace c by the solution x to Lx = c. - * Step 3: replace c by the solution x to Ux = c. + * Step 1: compute c as the solution to L^T c = b + * Step 2: replace c by the solution x to U^T x = c. + * Step 3: update c = P^-1 c. */ eigen_assert(rhs.rows() == m_lu.cols()); - if (Conjugate) { - // Step 1 - dst = m_lu.template triangularView().adjoint().solve(rhs); - // Step 2 - m_lu.template triangularView().adjoint().solveInPlace(dst); - } else { - // Step 1 - dst = m_lu.template triangularView().transpose().solve(rhs); - // Step 2 - m_lu.template triangularView().transpose().solveInPlace(dst); - } + // Step 1 + dst = m_lu.template triangularView().transpose() + .template conjugateIf().solve(rhs); + // Step 2 + m_lu.template triangularView().transpose() + .template conjugateIf().solveInPlace(dst); // Step 3 dst = permutationP().transpose() * dst; } @@ -339,17 +331,18 @@ PartialPivLU::PartialPivLU(EigenBase& matrix) namespace internal { /** \internal This is the blocked version of fullpivlu_unblocked() */ -template +template struct partial_lu_impl { - // FIXME add a stride to Map, so that the following mapping becomes easier, - // another option would be to create an expression being able to automatically - // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly - // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix, - // and Block. 
- typedef Map > MapLU; - typedef Block MatrixType; - typedef Block BlockType; + static const int UnBlockedBound = 16; + static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; + static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; + // Remaining rows and columns at compile-time: + static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic; + static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic; + typedef Matrix MatrixType; + typedef Ref MatrixTypeRef; + typedef Ref > BlockType; typedef typename MatrixType::RealScalar RealScalar; /** \internal performs the LU decomposition in-place of the matrix \a lu @@ -362,19 +355,22 @@ struct partial_lu_impl * * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. */ - static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) + static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) { typedef scalar_score_coeff_op Scoring; typedef typename Scoring::result_type Score; const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows,cols); + // For small compile-time matrices it is worth processing the last row separately: + // speedup: +100% for 2x2, +10% for others. + const Index endk = UnBlockedAtCompileTime ? size-1 : size; nb_transpositions = 0; Index first_zero_pivot = -1; - for(Index k = 0; k < size; ++k) + for(Index k = 0; k < endk; ++k) { - Index rrows = rows-k-1; - Index rcols = cols-k-1; + int rrows = internal::convert_index(rows-k-1); + int rcols = internal::convert_index(cols-k-1); Index row_of_biggest_in_col; Score biggest_in_corner @@ -391,9 +387,7 @@ struct partial_lu_impl ++nb_transpositions; } - // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k) - // overflow but not the actual quotient? 
- lu.col(k).tail(rrows) /= lu.coeff(k,k); + lu.col(k).tail(fix(rrows)) /= lu.coeff(k,k); } else if(first_zero_pivot==-1) { @@ -403,8 +397,18 @@ struct partial_lu_impl } if(k(rrows),fix(rcols)).noalias() -= lu.col(k).tail(fix(rrows)) * lu.row(k).tail(fix(rcols)); + } + + // special handling of the last entry + if(UnBlockedAtCompileTime) + { + Index k = endk; + row_transpositions[k] = PivIndex(k); + if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1) + first_zero_pivot = k; } + return first_zero_pivot; } @@ -420,18 +424,17 @@ struct partial_lu_impl * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * * \note This very low level interface using pointers, etc. is to: - * 1 - reduce the number of instanciations to the strict minimum - * 2 - avoid infinite recursion of the instanciations with Block > > + * 1 - reduce the number of instantiations to the strict minimum + * 2 - avoid infinite recursion of the instantiations with Block > > */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { - MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols); - MatrixType lu(lu1,0,0,rows,cols); + MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride)); const Index size = (std::min)(rows,cols); // if the matrix is too small, no blocking: - if(size<=16) + if(UnBlockedAtCompileTime || size<=UnBlockedBound) { return unblocked_lu(lu, row_transpositions, nb_transpositions); } @@ -457,12 +460,12 @@ struct partial_lu_impl // A00 | A01 | A02 // lu = A_0 | A_1 | A_2 = A10 | A11 | A12 // A20 | A21 | A22 - BlockType A_0(lu,0,0,rows,k); - BlockType A_2(lu,0,k+bs,rows,tsize); - BlockType A11(lu,k,k,bs,bs); - BlockType A12(lu,k,k+bs,bs,tsize); - BlockType A21(lu,k+bs,k,trows,bs); - BlockType A22(lu,k+bs,k+bs,trows,tsize); + BlockType A_0 = lu.block(0,0,rows,k); + 
BlockType A_2 = lu.block(0,k+bs,rows,tsize); + BlockType A11 = lu.block(k,k,bs,bs); + BlockType A12 = lu.block(k,k+bs,bs,tsize); + BlockType A21 = lu.block(k+bs,k,trows,bs); + BlockType A22 = lu.block(k+bs,k+bs,trows,tsize); PivIndex nb_transpositions_in_panel; // recursively call the blocked LU algorithm on [A11^T A21^T]^T @@ -501,11 +504,18 @@ struct partial_lu_impl template void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions) { + // Special-case of zero matrix. + if (lu.rows() == 0 || lu.cols() == 0) { + nb_transpositions = 0; + return; + } eigen_assert(lu.cols() == row_transpositions.size()); - eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); + eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl - + < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, + typename TranspositionType::StorageIndex, + EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)> ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); } diff --git a/inst/include/Eigen/src/LU/arch/InverseSize4.h b/inst/include/Eigen/src/LU/arch/InverseSize4.h new file mode 100644 index 00000000..a232ffc0 --- /dev/null +++ b/inst/include/Eigen/src/LU/arch/InverseSize4.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2001 Intel Corporation +// Copyright (C) 2010 Gael Guennebaud +// Copyright (C) 2009 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+// +// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath. +// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M, +// adjugate of M and determinant of M respectively. M# is computed block-wise +// using specific formulae. For proof, see: +// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html +// Variable names are adopted from \src\LU\Inverse_SSE.h. +// +// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h +// comes from the following Intel's library: +// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ +// +// Here is the respective copyright and license statement: +// +// Copyright (c) 2001 Intel Corporation. +// +// Permition is granted to use, copy, distribute and prepare derivative works +// of this library for any purpose and without fee, provided, that the above +// copyright notice and this statement appear in all copies. +// Intel makes no representations about the suitability of this software for +// any purpose, and specifically disclaims all warranties. +// See LEGAL.TXT for all the legal information. +// +// TODO: Unify implementations of different data types (i.e. float and double). 
+#ifndef EIGEN_INVERSE_SIZE_4_H +#define EIGEN_INVERSE_SIZE_4_H + +namespace Eigen +{ +namespace internal +{ +template +struct compute_inverse_size4 +{ + enum + { + MatrixAlignment = traits::Alignment, + ResultAlignment = traits::Alignment, + StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) + }; + typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType; + + static void run(const MatrixType &mat, ResultType &result) + { + ActualMatrixType matrix(mat); + + const float* data = matrix.data(); + const Index stride = matrix.innerStride(); + Packet4f _L1 = ploadt(data); + Packet4f _L2 = ploadt(data + stride*4); + Packet4f _L3 = ploadt(data + stride*8); + Packet4f _L4 = ploadt(data + stride*12); + + // Four 2x2 sub-matrices of the input matrix + // input = [[A, B], + // [C, D]] + Packet4f A, B, C, D; + + if (!StorageOrdersMatch) + { + A = vec4f_unpacklo(_L1, _L2); + B = vec4f_unpacklo(_L3, _L4); + C = vec4f_unpackhi(_L1, _L2); + D = vec4f_unpackhi(_L3, _L4); + } + else + { + A = vec4f_movelh(_L1, _L2); + B = vec4f_movehl(_L2, _L1); + C = vec4f_movelh(_L3, _L4); + D = vec4f_movehl(_L4, _L3); + } + + Packet4f AB, DC; + + // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. 
+ AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B); + AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1))); + + // DC = D#*C + DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C); + DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1))); + + // determinants of the sub-matrices + Packet4f dA, dB, dC, dD; + + dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A); + dA = psub(dA, vec4f_movehl(dA, dA)); + + dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B); + dB = psub(dB, vec4f_movehl(dB, dB)); + + dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C); + dC = psub(dC, vec4f_movehl(dC, dC)); + + dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D); + dD = psub(dD, vec4f_movehl(dD, dD)); + + Packet4f d, d1, d2; + + d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB); + d = padd(d, vec4f_movehl(d, d)); + d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0)); + d1 = pmul(dA, dD); + d2 = pmul(dB, dC); + + // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) + Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0); + + // reciprocal of the determinant of the input matrix, rd = 1/det + Packet4f rd = pdiv(pset1(1.0f), det); + + // Four sub-matrices of the inverse + Packet4f iA, iB, iC, iD; + + // iD = D*|A| - C*A#*B + iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB)); + iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB))); + iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD); + + // iA = A*|D| - B*D#*C + iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC)); + iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC))); + iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA); + + // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A + iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0)); + iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1))); + iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB); + + // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D + iC = pmul(A, 
vec4f_swizzle2(DC, DC, 3, 0, 3, 0)); + iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1))); + iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC); + + const float sign_mask[4] = {0.0f, numext::bit_cast(0x80000000u), numext::bit_cast(0x80000000u), 0.0f}; + const Packet4f p4f_sign_PNNP = ploadu(sign_mask); + rd = pxor(rd, p4f_sign_PNNP); + iA = pmul(iA, rd); + iB = pmul(iB, rd); + iC = pmul(iC, rd); + iD = pmul(iD, rd); + + Index res_stride = result.outerStride(); + float *res = result.data(); + + pstoret(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1)); + pstoret(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0)); + pstoret(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1)); + pstoret(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0)); + } +}; + +#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG)) +// same algorithm as above, except that each operand is split into +// halves for two registers to hold. +template +struct compute_inverse_size4 +{ + enum + { + MatrixAlignment = traits::Alignment, + ResultAlignment = traits::Alignment, + StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) + }; + typedef typename conditional<(MatrixType::Flags & LinearAccessBit), + MatrixType const &, + typename MatrixType::PlainObject>::type + ActualMatrixType; + + static void run(const MatrixType &mat, ResultType &result) + { + ActualMatrixType matrix(mat); + + // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower + // row e.g. 
A1, upper row of A, A2, lower row of A + // input = [[A, B], = [[[A1, [B1, + // [C, D]] A2], B2]], + // [[C1, [D1, + // C2], D2]]] + + Packet2d A1, A2, B1, B2, C1, C2, D1, D2; + + const double* data = matrix.data(); + const Index stride = matrix.innerStride(); + if (StorageOrdersMatch) + { + A1 = ploadt(data + stride*0); + B1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + B2 = ploadt(data + stride*6); + C1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + C2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); + } + else + { + Packet2d temp; + A1 = ploadt(data + stride*0); + C1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + C2 = ploadt(data + stride*6); + temp = A1; + A1 = vec2d_unpacklo(A1, A2); + A2 = vec2d_unpackhi(temp, A2); + + temp = C1; + C1 = vec2d_unpacklo(C1, C2); + C2 = vec2d_unpackhi(temp, C2); + + B1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + B2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); + + temp = B1; + B1 = vec2d_unpacklo(B1, B2); + B2 = vec2d_unpackhi(temp, B2); + + temp = D1; + D1 = vec2d_unpacklo(D1, D2); + D2 = vec2d_unpackhi(temp, D2); + } + + // determinants of the sub-matrices + Packet2d dA, dB, dC, dD; + + dA = vec2d_swizzle2(A2, A2, 1); + dA = pmul(A1, dA); + dA = psub(dA, vec2d_duplane(dA, 1)); + + dB = vec2d_swizzle2(B2, B2, 1); + dB = pmul(B1, dB); + dB = psub(dB, vec2d_duplane(dB, 1)); + + dC = vec2d_swizzle2(C2, C2, 1); + dC = pmul(C1, dC); + dC = psub(dC, vec2d_duplane(dC, 1)); + + dD = vec2d_swizzle2(D2, D2, 1); + dD = pmul(D1, dD); + dD = psub(dD, vec2d_duplane(dD, 1)); + + Packet2d DC1, DC2, AB1, AB2; + + // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. 
+ AB1 = pmul(B1, vec2d_duplane(A2, 1)); + AB2 = pmul(B2, vec2d_duplane(A1, 0)); + AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1))); + AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0))); + + // DC = D#*C + DC1 = pmul(C1, vec2d_duplane(D2, 1)); + DC2 = pmul(C2, vec2d_duplane(D1, 0)); + DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1))); + DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0))); + + Packet2d d1, d2; + + // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) + Packet2d det; + + // reciprocal of the determinant of the input matrix, rd = 1/det + Packet2d rd; + + d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0)); + d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3)); + rd = padd(d1, d2); + rd = padd(rd, vec2d_duplane(rd, 1)); + + d1 = pmul(dA, dD); + d2 = pmul(dB, dC); + + det = padd(d1, d2); + det = psub(det, rd); + det = vec2d_duplane(det, 0); + rd = pdiv(pset1(1.0), det); + + // rows of four sub-matrices of the inverse + Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2; + + // iD = D*|A| - C*A#*B + iD1 = pmul(AB1, vec2d_duplane(C1, 0)); + iD2 = pmul(AB1, vec2d_duplane(C2, 0)); + iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1))); + iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1))); + dA = vec2d_duplane(dA, 0); + iD1 = psub(pmul(D1, dA), iD1); + iD2 = psub(pmul(D2, dA), iD2); + + // iA = A*|D| - B*D#*C + iA1 = pmul(DC1, vec2d_duplane(B1, 0)); + iA2 = pmul(DC1, vec2d_duplane(B2, 0)); + iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1))); + iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1))); + dD = vec2d_duplane(dD, 0); + iA1 = psub(pmul(A1, dD), iA1); + iA2 = psub(pmul(A2, dD), iA2); + + // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A + iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1)); + iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1)); + iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2))); + iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2))); + dB = vec2d_duplane(dB, 0); + iB1 = psub(pmul(C1, dB), iB1); + iB2 = psub(pmul(C2, dB), 
iB2); + + // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D + iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1)); + iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1)); + iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2))); + iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2))); + dC = vec2d_duplane(dC, 0); + iC1 = psub(pmul(B1, dC), iC1); + iC2 = psub(pmul(B2, dC), iC2); + + const double sign_mask1[2] = {0.0, numext::bit_cast(0x8000000000000000ull)}; + const double sign_mask2[2] = {numext::bit_cast(0x8000000000000000ull), 0.0}; + const Packet2d sign_PN = ploadu(sign_mask1); + const Packet2d sign_NP = ploadu(sign_mask2); + d1 = pxor(rd, sign_PN); + d2 = pxor(rd, sign_NP); + + Index res_stride = result.outerStride(); + double *res = result.data(); + pstoret(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1)); + pstoret(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2)); + pstoret(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1)); + pstoret(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2)); + pstoret(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1)); + pstoret(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2)); + pstoret(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1)); + pstoret(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2)); + } +}; +#endif +} // namespace internal +} // namespace Eigen +#endif diff --git a/inst/include/Eigen/src/LU/arch/Inverse_SSE.h b/inst/include/Eigen/src/LU/arch/Inverse_SSE.h deleted file mode 100644 index 4dce2ef2..00000000 --- a/inst/include/Eigen/src/LU/arch/Inverse_SSE.h +++ /dev/null @@ -1,338 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2001 Intel Corporation -// Copyright (C) 2010 Gael Guennebaud -// Copyright (C) 2009 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// The SSE code for the 4x4 float and double matrix inverse in this file -// comes from the following Intel's library: -// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ -// -// Here is the respective copyright and license statement: -// -// Copyright (c) 2001 Intel Corporation. -// -// Permition is granted to use, copy, distribute and prepare derivative works -// of this library for any purpose and without fee, provided, that the above -// copyright notice and this statement appear in all copies. -// Intel makes no representations about the suitability of this software for -// any purpose, and specifically disclaims all warranties. -// See LEGAL.TXT for all the legal information. - -#ifndef EIGEN_INVERSE_SSE_H -#define EIGEN_INVERSE_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000)); - - // Load the full matrix into registers - __m128 _L1 = matrix.template packet( 0); - __m128 _L2 = matrix.template packet( 4); - __m128 _L3 = matrix.template packet( 8); - __m128 _L4 = matrix.template packet(12); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. 
Since each - // register holds four matrix element, the smaller matrices are - // represented as a registers. Hence we get a better locality of the - // calculations. - - __m128 A, B, C, D; // the four sub-matrices - if(!StorageOrdersMatch) - { - A = _mm_unpacklo_ps(_L1, _L2); - B = _mm_unpacklo_ps(_L3, _L4); - C = _mm_unpackhi_ps(_L1, _L2); - D = _mm_unpackhi_ps(_L3, _L4); - } - else - { - A = _mm_movelh_ps(_L1, _L2); - B = _mm_movehl_ps(_L2, _L1); - C = _mm_movelh_ps(_L3, _L4); - D = _mm_movehl_ps(_L4, _L3); - } - - __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices - DC, AB; - __m128 dA, dB, dC, dD; // determinant of the sub-matrices - __m128 det, d, d1, d2; - __m128 rd; // reciprocal of the determinant - - // AB = A# * B - AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B); - AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E))); - // DC = D# * C - DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C); - DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E))); - - // dA = |A| - dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A); - dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA)); - // dB = |B| - dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B); - dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB)); - - // dC = |C| - dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C); - dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC)); - // dD = |D| - dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D); - dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD)); - - // d = trace(AB*DC) = trace(A#*B*D#*C) - d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB); - - // iD = C*A#*B - iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB)); - iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB))); - // iA = B*D#*C - iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC)); - iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC))); - - // d = trace(AB*DC) = trace(A#*B*D#*C) [continue] - d = _mm_add_ps(d, _mm_movehl_ps(d, d)); - d = 
_mm_add_ss(d, _mm_shuffle_ps(d, d, 1)); - d1 = _mm_mul_ss(dA,dD); - d2 = _mm_mul_ss(dB,dC); - - // iD = D*|A| - C*A#*B - iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD); - - // iA = A*|D| - B*D#*C; - iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_sub_ss(_mm_add_ss(d1,d2),d); - rd = _mm_div_ss(_mm_set_ss(1.0f), det); - -// #ifdef ZERO_SINGULAR -// rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd); -// #endif - - // iB = D * (A#B)# = D*B#*A - iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33)); - iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66))); - // iC = A * (D#C)# = A*C#*D - iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33)); - iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); - - rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, p4f_sign_PNNP); - - // iB = C*|B| - D*B#*A - iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); - - // iC = B*|C| - A*C#*D; - iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC); - - // iX = iX / det - iA = _mm_mul_ps(rd,iA); - iB = _mm_mul_ps(rd,iB); - iC = _mm_mul_ps(rd,iC); - iD = _mm_mul_ps(rd,iD); - - Index res_stride = result.outerStride(); - float* res = result.data(); - pstoret(res+0, _mm_shuffle_ps(iA,iB,0x77)); - pstoret(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); - pstoret(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); - pstoret(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); - } - -}; - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - const 
__m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register of the matrix holds two elements, the smaller matrices are - // consisted of two registers. Hence we get a better locality of the - // calculations. - - // the four sub-matrices - __m128d A1, A2, B1, B2, C1, C2, D1, D2; - - if(StorageOrdersMatch) - { - A1 = matrix.template packet( 0); B1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); B2 = matrix.template packet( 6); - C1 = matrix.template packet( 8); D1 = matrix.template packet(10); - C2 = matrix.template packet(12); D2 = matrix.template packet(14); - } - else - { - __m128d tmp; - A1 = matrix.template packet( 0); C1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); C2 = matrix.template packet( 6); - tmp = A1; - A1 = _mm_unpacklo_pd(A1,A2); - A2 = _mm_unpackhi_pd(tmp,A2); - tmp = C1; - C1 = _mm_unpacklo_pd(C1,C2); - C2 = _mm_unpackhi_pd(tmp,C2); - - B1 = matrix.template packet( 8); D1 = matrix.template packet(10); - B2 = matrix.template packet(12); D2 = matrix.template packet(14); - tmp = B1; - B1 = _mm_unpacklo_pd(B1,B2); - B2 = _mm_unpackhi_pd(tmp,B2); - tmp = D1; - D1 = _mm_unpacklo_pd(D1,D2); - D2 = _mm_unpackhi_pd(tmp,D2); - } - - __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices - DC1, DC2, AB1, AB2; - __m128d dA, dB, dC, dD; // determinant of the sub-matrices - __m128d det, d1, d2, rd; - - // dA = |A| - dA = _mm_shuffle_pd(A2, A2, 1); - dA = _mm_mul_pd(A1, dA); - dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3)); - // dB = |B| - dB = _mm_shuffle_pd(B2, B2, 1); - dB = _mm_mul_pd(B1, dB); - dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3)); - - // AB = A# * B - AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3)); - AB2 = _mm_mul_pd(B2, 
_mm_shuffle_pd(A1,A1,0)); - AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3))); - AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0))); - - // dC = |C| - dC = _mm_shuffle_pd(C2, C2, 1); - dC = _mm_mul_pd(C1, dC); - dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3)); - // dD = |D| - dD = _mm_shuffle_pd(D2, D2, 1); - dD = _mm_mul_pd(D1, dD); - dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3)); - - // DC = D# * C - DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3)); - DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0)); - DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3))); - DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0))); - - // rd = trace(AB*DC) = trace(A#*B*D#*C) - d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0)); - d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3)); - rd = _mm_add_pd(d1, d2); - rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3)); - - // iD = C*A#*B - iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0)); - iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0)); - iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3))); - iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3))); - - // iA = B*D#*C - iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0)); - iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0)); - iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3))); - iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3))); - - // iD = D*|A| - C*A#*B - dA = _mm_shuffle_pd(dA,dA,0); - iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1); - iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2); - - // iA = A*|D| - B*D#*C; - dD = _mm_shuffle_pd(dD,dD,0); - iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1); - iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2); - - d1 = _mm_mul_sd(dA, dD); - d2 = _mm_mul_sd(dB, dC); - - // iB = D * (A#B)# = D*B#*A - iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1)); - iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1)); - iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2))); - iB2 = _mm_sub_pd(iB2, 
_mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2))); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_add_sd(d1, d2); - det = _mm_sub_sd(det, rd); - - // iC = A * (D#C)# = A*C#*D - iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1)); - iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1)); - iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2))); - iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2))); - - rd = _mm_div_sd(_mm_set_sd(1.0), det); -// #ifdef ZERO_SINGULAR -// rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd); -// #endif - rd = _mm_shuffle_pd(rd,rd,0); - - // iB = C*|B| - D*B#*A - dB = _mm_shuffle_pd(dB,dB,0); - iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1); - iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2); - - d1 = _mm_xor_pd(rd, _Sign_PN); - d2 = _mm_xor_pd(rd, _Sign_NP); - - // iC = B*|C| - A*C#*D; - dC = _mm_shuffle_pd(dC,dC,0); - iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); - iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); - - Index res_stride = result.outerStride(); - double* res = result.data(); - pstoret(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); - pstoret(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); - pstoret(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); - pstoret(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); - pstoret(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); - pstoret(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); - pstoret(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); - pstoret(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_INVERSE_SSE_H diff --git a/inst/include/Eigen/src/OrderingMethods/Amd.h b/inst/include/Eigen/src/OrderingMethods/Amd.h index f91ecb24..7ca3f33b 100644 --- a/inst/include/Eigen/src/OrderingMethods/Amd.h +++ 
b/inst/include/Eigen/src/OrderingMethods/Amd.h @@ -2,32 +2,22 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /* - NOTE: this routine has been adapted from the CSparse library: Copyright (c) 2006, Timothy A. Davis. http://www.suitesparse.com -CSparse is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -CSparse is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this Module; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - +The author of CSparse, Timothy A. Davis., has executed a license with Google LLC +to permit distribution of this code and derivative works as part of Eigen under +the Mozilla Public License v. 2.0, as stated at the top of this file. */ -#include "../Core/util/NonMPL2.h" - #ifndef EIGEN_SPARSE_AMD_H #define EIGEN_SPARSE_AMD_H diff --git a/inst/include/Eigen/src/OrderingMethods/Eigen_Colamd.h b/inst/include/Eigen/src/OrderingMethods/Eigen_Colamd.h index da85b4d6..8e339a70 100644 --- a/inst/include/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/inst/include/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -13,115 +13,119 @@ // Davis (davis@cise.ufl.edu), University of Florida. The algorithm was // developed in collaboration with John Gilbert, Xerox PARC, and Esmond // Ng, Oak Ridge National Laboratory. 
-// +// // Date: -// +// // September 8, 2003. Version 2.3. -// +// // Acknowledgements: -// +// // This work was supported by the National Science Foundation, under // grants DMS-9504974 and DMS-9803599. -// +// // Notice: -// +// // Copyright (c) 1998-2003 by the University of Florida. // All Rights Reserved. -// +// // THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY // EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. -// +// // Permission is hereby granted to use, copy, modify, and/or distribute // this program, provided that the Copyright, this License, and the // Availability of the original version is retained on all copies and made // accessible to the end-user of any code or package that includes COLAMD -// or any modified version of COLAMD. -// +// or any modified version of COLAMD. +// // Availability: -// +// // The colamd/symamd library is available at -// +// // http://www.suitesparse.com - + #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H namespace internal { + +namespace Colamd { + /* Ensure that debugging is turned off: */ #ifndef COLAMD_NDEBUG #define COLAMD_NDEBUG #endif /* NDEBUG */ + + /* ========================================================================== */ /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ /* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -#define COLAMD_KNOBS 20 +const int NKnobs = 20; /* number of output statistics. Only stats [0..6] are currently used. */ -#define COLAMD_STATS 20 +const int NStats = 20; -/* knobs [0] and stats [0]: dense row knob and output statistic. */ -#define COLAMD_DENSE_ROW 0 +/* Indices into knobs and stats array. */ +enum KnobsStatsIndex { + /* knobs [0] and stats [0]: dense row knob and output statistic. */ + DenseRow = 0, -/* knobs [1] and stats [1]: dense column knob and output statistic. 
*/ -#define COLAMD_DENSE_COL 1 + /* knobs [1] and stats [1]: dense column knob and output statistic. */ + DenseCol = 1, -/* stats [2]: memory defragmentation count output statistic */ -#define COLAMD_DEFRAG_COUNT 2 + /* stats [2]: memory defragmentation count output statistic */ + DefragCount = 2, -/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -#define COLAMD_STATUS 3 + /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ + Status = 3, -/* stats [4..6]: error info, or info on jumbled columns */ -#define COLAMD_INFO1 4 -#define COLAMD_INFO2 5 -#define COLAMD_INFO3 6 + /* stats [4..6]: error info, or info on jumbled columns */ + Info1 = 4, + Info2 = 5, + Info3 = 6 +}; /* error codes returned in stats [3]: */ -#define COLAMD_OK (0) -#define COLAMD_OK_BUT_JUMBLED (1) -#define COLAMD_ERROR_A_not_present (-1) -#define COLAMD_ERROR_p_not_present (-2) -#define COLAMD_ERROR_nrow_negative (-3) -#define COLAMD_ERROR_ncol_negative (-4) -#define COLAMD_ERROR_nnz_negative (-5) -#define COLAMD_ERROR_p0_nonzero (-6) -#define COLAMD_ERROR_A_too_small (-7) -#define COLAMD_ERROR_col_length_negative (-8) -#define COLAMD_ERROR_row_index_out_of_bounds (-9) -#define COLAMD_ERROR_out_of_memory (-10) -#define COLAMD_ERROR_internal_error (-999) - +enum Status { + Ok = 0, + OkButJumbled = 1, + ErrorANotPresent = -1, + ErrorPNotPresent = -2, + ErrorNrowNegative = -3, + ErrorNcolNegative = -4, + ErrorNnzNegative = -5, + ErrorP0Nonzero = -6, + ErrorATooSmall = -7, + ErrorColLengthNegative = -8, + ErrorRowIndexOutOfBounds = -9, + ErrorOutOfMemory = -10, + ErrorInternalError = -999 +}; /* ========================================================================== */ /* === Definitions ========================================================== */ /* ========================================================================== */ -#define ONES_COMPLEMENT(r) (-(r)-1) +template +IndexType ones_complement(const IndexType r) { + return (-(r)-1); +} /* 
-------------------------------------------------------------------------- */ - -#define COLAMD_EMPTY (-1) +const int Empty = -1; /* Row and column status */ -#define ALIVE (0) -#define DEAD (-1) +enum RowColumnStatus { + Alive = 0, + Dead = -1 +}; /* Column status */ -#define DEAD_PRINCIPAL (-1) -#define DEAD_NON_PRINCIPAL (-2) - -/* Macros for row and column status update and checking. */ -#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } +enum ColumnStatus { + DeadPrincipal = -1, + DeadNonPrincipal = -2 +}; /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -129,9 +133,9 @@ namespace internal { // == Row and Column structures == template -struct colamd_col +struct ColStructure { - IndexType start ; /* index for A of first row in this column, or DEAD */ + IndexType start ; /* index for A of first row in this column, or Dead */ /* if column is dead */ IndexType length ; /* number of rows in this column */ union @@ -159,11 +163,21 @@ struct colamd_col IndexType degree_next ; /* next column, if col is in a degree list */ IndexType hash_next ; /* next column, if col is in a hash list */ } shared4 ; - + + inline bool is_dead() const { return start < Alive; } + + inline bool is_alive() const { return start >= Alive; } + + inline bool is_dead_principal() const { return start == DeadPrincipal; } + + inline void kill_principal() { start = DeadPrincipal; } + + inline void 
kill_non_principal() { start = DeadNonPrincipal; } + }; - + template -struct Colamd_Row +struct RowStructure { IndexType start ; /* index for A of first col in this row */ IndexType length ; /* number of principal columns in this row */ @@ -177,13 +191,19 @@ struct Colamd_Row IndexType mark ; /* for computing set differences and marking dead rows*/ IndexType first_column ;/* first column in row (used in garbage collection) */ } shared2 ; - + + inline bool is_dead() const { return shared2.mark < Alive; } + + inline bool is_alive() const { return shared2.mark >= Alive; } + + inline void kill() { shared2.mark = Dead; } + }; - + /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ - + /* The recommended length Alen of the array A passed to colamd is given by the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro. It returns -1 if any @@ -192,41 +212,41 @@ struct Colamd_Row required for the Col and Row arrays, respectively, which are internal to colamd. An additional n_col space is the minimal amount of "elbow room", and nnz/5 more space is recommended for run time efficiency. - + This macro is not needed when using symamd. - + Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid gcc -pedantic warning messages. 
*/ template -inline IndexType colamd_c(IndexType n_col) -{ return IndexType( ((n_col) + 1) * sizeof (colamd_col) / sizeof (IndexType) ) ; } +inline IndexType colamd_c(IndexType n_col) +{ return IndexType( ((n_col) + 1) * sizeof (ColStructure) / sizeof (IndexType) ) ; } template inline IndexType colamd_r(IndexType n_row) -{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row) / sizeof (IndexType)); } +{ return IndexType(((n_row) + 1) * sizeof (RowStructure) / sizeof (IndexType)); } // Prototypes of non-user callable routines template -static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); +static IndexType init_rows_cols (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure col [], IndexType A [], IndexType p [], IndexType stats[NStats] ); template -static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); +static void init_scoring (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], double knobs[NKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); template -static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); +static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); template -static void order_children (IndexType n_col, colamd_col Col [], IndexType p []); +static void order_children (IndexType n_col, ColStructure Col [], IndexType p []); template -static void detect_super_cols 
(colamd_col Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; +static void detect_super_cols (ColStructure Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; template -static IndexType garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType *pfree) ; +static IndexType garbage_collection (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType *pfree) ; template -static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row [] ) ; +static inline IndexType clear_mark (IndexType n_row, RowStructure Row [] ) ; /* === No debugging ========================================================= */ @@ -240,37 +260,37 @@ static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row /** - * \brief Returns the recommended value of Alen - * - * Returns recommended value of Alen for use by colamd. - * Returns -1 if any input argument is negative. - * The use of this routine or macro is optional. - * Note that the macro uses its arguments more than once, - * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. - * + * \brief Returns the recommended value of Alen + * + * Returns recommended value of Alen for use by colamd. + * Returns -1 if any input argument is negative. + * The use of this routine or macro is optional. + * Note that the macro uses its arguments more than once, + * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. 
+ * * \param nnz nonzeros in A * \param n_row number of rows in A * \param n_col number of columns in A * \return recommended value of Alen for use by colamd */ template -inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col) +inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col) { if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0) return (-1); else - return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); + return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); } /** * \brief set default parameters The use of this routine is optional. - * - * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) + * + * Colamd: rows with more than (knobs [DenseRow] * n_col) * entries are removed prior to ordering. Columns with more than - * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to - * ordering, and placed last in the output column ordering. + * (knobs [DenseCol] * n_row) entries are removed prior to + * ordering, and placed last in the output column ordering. * - * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1, + * DenseRow and DenseCol are defined as 0 and 1, * respectively, in colamd.h. Default values of these two knobs * are both 0.5. Currently, only knobs [0] and knobs [1] are * used, but future versions may use more knobs. If so, they will @@ -279,37 +299,37 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType * not need to change, assuming that you either use * colamd_set_defaults, or pass a (double *) NULL pointer as the * knobs array to colamd or symamd. 
- * + * * \param knobs parameter settings for colamd */ -static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) +static inline void set_defaults(double knobs[NKnobs]) { /* === Local variables ================================================== */ - + int i ; if (!knobs) { return ; /* no knobs to initialize */ } - for (i = 0 ; i < COLAMD_KNOBS ; i++) + for (i = 0 ; i < NKnobs ; i++) { knobs [i] = 0 ; } - knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ - knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ + knobs [Colamd::DenseRow] = 0.5 ; /* ignore rows over 50% dense */ + knobs [Colamd::DenseCol] = 0.5 ; /* ignore columns over 50% dense */ } -/** +/** * \brief Computes a column ordering using the column approximate minimum degree ordering - * + * * Computes a column ordering (Q) of A such that P(AQ)=LU or * (AQ)'AQ=LL' have less fill-in and require fewer floating point * operations than factorizing the unpermuted matrix A or A'A, * respectively. 
- * - * + * + * * \param n_row number of rows in A * \param n_col number of columns in A * \param Alen, size of the array A @@ -319,143 +339,143 @@ static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) * \param stats colamd output statistics and error codes */ template -static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS]) +static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[NKnobs], IndexType stats[NStats]) { /* === Local variables ================================================== */ - + IndexType i ; /* loop index */ IndexType nnz ; /* nonzeros in A */ IndexType Row_size ; /* size of Row [], in integers */ IndexType Col_size ; /* size of Col [], in integers */ IndexType need ; /* minimum required length of A */ - Colamd_Row *Row ; /* pointer into A of Row [0..n_row] array */ - colamd_col *Col ; /* pointer into A of Col [0..n_col] array */ + Colamd::RowStructure *Row ; /* pointer into A of Row [0..n_row] array */ + Colamd::ColStructure *Col ; /* pointer into A of Col [0..n_col] array */ IndexType n_col2 ; /* number of non-dense, non-empty columns */ IndexType n_row2 ; /* number of non-dense, non-empty rows */ IndexType ngarbage ; /* number of garbage collections performed */ IndexType max_deg ; /* maximum row degree */ - double default_knobs [COLAMD_KNOBS] ; /* default knobs array */ - - + double default_knobs [NKnobs] ; /* default knobs array */ + + /* === Check the input arguments ======================================== */ - + if (!stats) { COLAMD_DEBUG0 (("colamd: stats not present\n")) ; return (false) ; } - for (i = 0 ; i < COLAMD_STATS ; i++) + for (i = 0 ; i < NStats ; i++) { stats [i] = 0 ; } - stats [COLAMD_STATUS] = COLAMD_OK ; - stats [COLAMD_INFO1] = -1 ; - stats [COLAMD_INFO2] = -1 ; - + stats [Colamd::Status] = Colamd::Ok ; + stats [Colamd::Info1] = -1 ; + stats 
[Colamd::Info2] = -1 ; + if (!A) /* A is not present */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; + stats [Colamd::Status] = Colamd::ErrorANotPresent ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } - + if (!p) /* p is not present */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; + stats [Colamd::Status] = Colamd::ErrorPNotPresent ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } - + if (n_row < 0) /* n_row must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ; - stats [COLAMD_INFO1] = n_row ; + stats [Colamd::Status] = Colamd::ErrorNrowNegative ; + stats [Colamd::Info1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } - + if (n_col < 0) /* n_col must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; - stats [COLAMD_INFO1] = n_col ; + stats [Colamd::Status] = Colamd::ErrorNcolNegative ; + stats [Colamd::Info1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } - + nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; - stats [COLAMD_INFO1] = nnz ; + stats [Colamd::Status] = Colamd::ErrorNnzNegative ; + stats [Colamd::Info1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } - + if (p [0] != 0) { - stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; - stats [COLAMD_INFO1] = p [0] ; + stats [Colamd::Status] = Colamd::ErrorP0Nonzero ; + stats [Colamd::Info1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } - + /* === If no knobs, set default knobs =================================== */ - + if (!knobs) { - colamd_set_defaults (default_knobs) ; + set_defaults (default_knobs) ; knobs = default_knobs ; } - + /* === Allocate the Row and Col arrays from array A ===================== */ - + Col_size = colamd_c (n_col) ; Row_size = colamd_r (n_row) ; need = 2*nnz + n_col + Col_size 
+ Row_size ; - + if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ; - stats [COLAMD_INFO1] = need ; - stats [COLAMD_INFO2] = Alen ; + stats [Colamd::Status] = Colamd::ErrorATooSmall ; + stats [Colamd::Info1] = need ; + stats [Colamd::Info2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } - + Alen -= Col_size + Row_size ; - Col = (colamd_col *) &A [Alen] ; - Row = (Colamd_Row *) &A [Alen + Col_size] ; + Col = (ColStructure *) &A [Alen] ; + Row = (RowStructure *) &A [Alen + Col_size] ; /* === Construct the row and column data structures ===================== */ - - if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) + + if (!Colamd::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) { /* input matrix is invalid */ COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ; return (false) ; } - + /* === Initialize scores, kill dense rows/columns ======================= */ - Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs, + Colamd::init_scoring (n_row, n_col, Row, Col, A, p, knobs, &n_row2, &n_col2, &max_deg) ; - + /* === Order the supercolumns =========================================== */ - - ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p, + + ngarbage = Colamd::find_ordering (n_row, n_col, Alen, Row, Col, A, p, n_col2, max_deg, 2*nnz) ; - + /* === Order the non-principal columns ================================== */ - - Eigen::internal::order_children (n_col, Col, p) ; - + + Colamd::order_children (n_col, Col, p) ; + /* === Return statistics in stats ======================================= */ - - stats [COLAMD_DENSE_ROW] = n_row - n_row2 ; - stats [COLAMD_DENSE_COL] = n_col - n_col2 ; - stats [COLAMD_DEFRAG_COUNT] = ngarbage ; - COLAMD_DEBUG0 (("colamd: done.\n")) ; + + stats [Colamd::DenseRow] = n_row - n_row2 ; + stats [Colamd::DenseCol] = n_col - n_col2 ; + stats 
[Colamd::DefragCount] = ngarbage ; + COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -465,7 +485,6 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * /* There are no user-callable routines beyond this point in the file */ - /* ========================================================================== */ /* === init_rows_cols ======================================================= */ /* ========================================================================== */ @@ -485,11 +504,11 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A, of size Alen */ IndexType p [], /* pointers to columns in A, of size n_col+1 */ - IndexType stats [COLAMD_STATS] /* colamd statistics */ + IndexType stats [NStats] /* colamd statistics */ ) { /* === Local variables ================================================== */ @@ -512,24 +531,24 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = Col [col].length ; + stats [Colamd::Status] = Colamd::ErrorColLengthNegative ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } Col [col].shared1.thickness = 1 ; Col [col].shared2.score = 0 ; - Col [col].shared3.prev = COLAMD_EMPTY ; - Col [col].shared4.degree_next = COLAMD_EMPTY ; + Col [col].shared3.prev = Empty ; + Col [col].shared4.degree_next = 
Empty ; } /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ /* === Scan columns, compute row degrees, and check row indices ========= */ - stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ + stats [Info3] = 0 ; /* number of duplicate or unsorted row indices*/ for (row = 0 ; row < n_row ; row++) { @@ -551,10 +570,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - stats [COLAMD_INFO3] = n_row ; + stats [Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = row ; + stats [Colamd::Info3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -563,10 +582,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. 
*/ - stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - (stats [COLAMD_INFO3]) ++ ; + stats [Colamd::Status] = Colamd::OkButJumbled ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = row ; + (stats [Colamd::Info3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -604,7 +623,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === Create row form ================================================== */ - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + if (stats [Status] == OkButJumbled) { /* if cols jumbled, watch for repeated row indices */ for (col = 0 ; col < n_col ; col++) @@ -646,7 +665,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === See if we need to re-create columns ============================== */ - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + if (stats [Status] == OkButJumbled) { COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; @@ -701,11 +720,11 @@ static void init_scoring IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ - double knobs [COLAMD_KNOBS],/* parameters */ + double knobs [NKnobs],/* parameters */ IndexType *p_n_row2, /* number of non-dense, non-empty rows */ IndexType *p_n_col2, /* number of non-dense, non-empty columns */ IndexType *p_max_deg /* maximum row degree */ @@ -732,8 +751,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ; - dense_col_count = 
numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseRow] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseCol] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -750,7 +769,7 @@ static void init_scoring { /* this is a empty column, kill and order it last */ Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; + Col[c].kill_principal() ; } } COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; @@ -761,7 +780,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip any dead columns */ - if (COL_IS_DEAD (c)) + if (Col[c].is_dead()) { continue ; } @@ -777,7 +796,7 @@ static void init_scoring { Row [*cp++].shared1.degree-- ; } - KILL_PRINCIPAL_COL (c) ; + Col[c].kill_principal() ; } } COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ; @@ -791,7 +810,7 @@ static void init_scoring if (deg > dense_row_count || deg == 0) { /* kill a dense or empty row */ - KILL_ROW (r) ; + Row[r].kill() ; --n_row2 ; } else @@ -813,7 +832,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip dead column */ - if (COL_IS_DEAD (c)) + if (Col[c].is_dead()) { continue ; } @@ -826,7 +845,7 @@ static void init_scoring /* get a row */ row = *cp++ ; /* skip if dead */ - if (ROW_IS_DEAD (row)) + if (Row[row].is_dead()) { continue ; } @@ -845,7 +864,7 @@ static void init_scoring /* and have already been killed) */ COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ; Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; + Col[c].kill_principal() ; } else { @@ -870,7 +889,7 @@ static void init_scoring /* clear the hash buckets */ for (c = 0 ; c <= n_col ; c++) { - head [c] = COLAMD_EMPTY ; + head [c] = Empty ; } min_score = n_col ; 
/* place in reverse order, so low column indices are at the front */ @@ -878,7 +897,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* only add principal columns to degree lists */ - if (COL_IS_ALIVE (c)) + if (Col[c].is_alive()) { COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n", c, Col [c].shared2.score, min_score, n_col)) ; @@ -891,16 +910,16 @@ static void init_scoring COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (score >= 0) ; COLAMD_ASSERT (score <= n_col) ; - COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [score] >= Empty) ; /* now add this column to dList at proper score location */ next_col = head [score] ; - Col [c].shared3.prev = COLAMD_EMPTY ; + Col [c].shared3.prev = Empty ; Col [c].shared4.degree_next = next_col ; /* if there already was a column with the same score, set its */ /* previous pointer to this new column */ - if (next_col != COLAMD_EMPTY) + if (next_col != Empty) { Col [next_col].shared3.prev = c ; } @@ -939,8 +958,8 @@ static IndexType find_ordering /* return the number of garbage collections */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ IndexType Alen, /* size of A, 2*nnz + n_col or larger */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ IndexType n_col2, /* Remaining columns to order */ @@ -986,7 +1005,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Initialization and clear mark ==================================== */ max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; min_score = 0 ; ngarbage = 0 ; COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ; @@ -1001,10 
+1020,10 @@ static IndexType find_ordering /* return the number of garbage collections */ /* make sure degree list isn't empty */ COLAMD_ASSERT (min_score >= 0) ; COLAMD_ASSERT (min_score <= n_col) ; - COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [min_score] >= Empty) ; /* get pivot column from head of minimum degree list */ - while (min_score < n_col && head [min_score] == COLAMD_EMPTY) + while (min_score < n_col && head [min_score] == Empty) { min_score++ ; } @@ -1012,12 +1031,12 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; next_col = Col [pivot_col].shared4.degree_next ; head [min_score] = next_col ; - if (next_col != COLAMD_EMPTY) + if (next_col != Empty) { - Col [next_col].shared3.prev = COLAMD_EMPTY ; + Col [next_col].shared3.prev = Empty ; } - COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ; + COLAMD_ASSERT (Col[pivot_col].is_alive()) ; COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ; /* remember score for defrag check */ @@ -1036,12 +1055,12 @@ static IndexType find_ordering /* return the number of garbage collections */ needed_memory = numext::mini(pivot_col_score, n_col - k) ; if (pfree + needed_memory >= Alen) { - pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; + pfree = Colamd::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; ngarbage++ ; /* after garbage collection we will have enough */ COLAMD_ASSERT (pfree + needed_memory < Alen) ; /* garbage collection has wiped out the Row[].shared2.mark array */ - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; } @@ -1064,9 +1083,9 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; + COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", Row[row].is_alive(), row)) ; /* skip if 
row is dead */ - if (ROW_IS_DEAD (row)) + if (Row[row].is_dead()) { continue ; } @@ -1078,7 +1097,7 @@ static IndexType find_ordering /* return the number of garbage collections */ col = *rp++ ; /* add the column, if alive and untagged */ col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && COL_IS_ALIVE (col)) + if (col_thickness > 0 && Col[col].is_alive()) { /* tag column in pivot row */ Col [col].shared1.thickness = -col_thickness ; @@ -1105,7 +1124,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* may be killing an already dead row */ row = *cp++ ; COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ; - KILL_ROW (row) ; + Row[row].kill() ; } /* === Select a row index to use as the new pivot row =============== */ @@ -1120,7 +1139,7 @@ static IndexType find_ordering /* return the number of garbage collections */ else { /* there is no pivot row, since it is of zero length */ - pivot_row = COLAMD_EMPTY ; + pivot_row = Empty ; COLAMD_ASSERT (pivot_row_length == 0) ; } COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; @@ -1157,7 +1176,7 @@ static IndexType find_ordering /* return the number of garbage collections */ while (rp < rp_end) { col = *rp++ ; - COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; COLAMD_DEBUG3 (("Col: %d\n", col)) ; /* clear tags used to construct pivot row pattern */ @@ -1172,8 +1191,8 @@ static IndexType find_ordering /* return the number of garbage collections */ next_col = Col [col].shared4.degree_next ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ; - if (prev_col == COLAMD_EMPTY) + COLAMD_ASSERT (cur_score >= Empty) ; + if (prev_col == Empty) { head [cur_score] = next_col ; } @@ -1181,7 +1200,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { Col [prev_col].shared4.degree_next = next_col ; } - if 
(next_col != COLAMD_EMPTY) + if (next_col != Empty) { Col [next_col].shared3.prev = prev_col ; } @@ -1194,12 +1213,12 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) + if (Row[row].is_dead()) { continue ; } + row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row != pivot_row) ; set_difference = row_mark - tag_mark ; /* check if the row has been seen yet */ @@ -1215,7 +1234,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (set_difference == 0) { COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ; - KILL_ROW (row) ; + Row[row].kill() ; } else { @@ -1237,7 +1256,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a column */ col = *rp++ ; - COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; hash = 0 ; cur_score = 0 ; cp = &A [Col [col].start] ; @@ -1252,12 +1271,12 @@ static IndexType find_ordering /* return the number of garbage collections */ /* get a row */ row = *cp++ ; COLAMD_ASSERT(row >= 0 && row < n_row) ; - row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) + if (Row [row].is_dead()) { continue ; } + row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row_mark > tag_mark) ; /* compact the column */ *new_cp++ = row ; @@ -1278,7 +1297,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { COLAMD_DEBUG4 (("further mass elimination. 
Col: %d\n", col)) ; /* nothing left but the pivot row in this column */ - KILL_PRINCIPAL_COL (col) ; + Col[col].kill_principal() ; pivot_row_degree -= Col [col].shared1.thickness ; COLAMD_ASSERT (pivot_row_degree >= 0) ; /* order it */ @@ -1302,7 +1321,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (hash <= n_col) ; head_column = head [hash] ; - if (head_column > COLAMD_EMPTY) + if (head_column > Empty) { /* degree list "hash" is non-empty, use prev (shared3) of */ /* first column in degree list as head of hash bucket */ @@ -1319,7 +1338,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* save hash function in Col [col].shared3.hash */ Col [col].shared3.hash = (IndexType) hash ; - COLAMD_ASSERT (COL_IS_ALIVE (col)) ; + COLAMD_ASSERT (Col[col].is_alive()) ; } } @@ -1329,11 +1348,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_DEBUG3 (("** Supercolumn detection phase. 
**\n")) ; - Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; + Colamd::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; /* === Kill the pivotal column ====================================== */ - KILL_PRINCIPAL_COL (pivot_col) ; + Col[pivot_col].kill_principal() ; /* === Clear mark =================================================== */ @@ -1341,7 +1360,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (tag_mark >= max_mark) { COLAMD_DEBUG2 (("clearing tag_mark\n")) ; - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; } /* === Finalize the new pivot row, and column scores ================ */ @@ -1357,7 +1376,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { col = *rp++ ; /* skip dead columns */ - if (COL_IS_DEAD (col)) + if (Col[col].is_dead()) { continue ; } @@ -1391,11 +1410,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [cur_score] >= Empty) ; next_col = head [cur_score] ; Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = COLAMD_EMPTY ; - if (next_col != COLAMD_EMPTY) + Col [col].shared3.prev = Empty ; + if (next_col != Empty) { Col [next_col].shared3.prev = col ; } @@ -1448,7 +1467,7 @@ static inline void order_children /* === Parameters ======================================================= */ IndexType n_col, /* number of columns of A */ - colamd_col Col [], /* of size n_col+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType p [] /* p [0 ... 
n_col-1] is the column permutation*/ ) { @@ -1464,15 +1483,15 @@ static inline void order_children for (i = 0 ; i < n_col ; i++) { /* find an un-ordered non-principal column */ - COLAMD_ASSERT (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY) + COLAMD_ASSERT (col_is_dead(Col, i)) ; + if (!Col[i].is_dead_principal() && Col [i].shared2.order == Empty) { parent = i ; /* once found, find its principal parent */ do { parent = Col [parent].shared1.parent ; - } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; + } while (!Col[parent].is_dead_principal()) ; /* now, order all un-ordered non-principal columns along path */ /* to this parent. collapse tree at the same time */ @@ -1482,7 +1501,7 @@ static inline void order_children do { - COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ; + COLAMD_ASSERT (Col [c].shared2.order == Empty) ; /* order this column */ Col [c].shared2.order = order++ ; @@ -1493,9 +1512,9 @@ static inline void order_children c = Col [c].shared1.parent ; /* continue until we hit an ordered column. 
There are */ - /* guarranteed not to be anymore unordered columns */ + /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ - } while (Col [c].shared2.order == COLAMD_EMPTY) ; + } while (Col [c].shared2.order == Empty) ; /* re-order the super_col parent to largest order for this group */ Col [parent].shared2.order = order ; @@ -1547,8 +1566,8 @@ template static void detect_super_cols ( /* === Parameters ======================================================= */ - - colamd_col Col [], /* of size n_col+1 */ + + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A */ IndexType head [], /* head of degree lists and hash buckets */ IndexType row_start, /* pointer to set of columns to check */ @@ -1578,7 +1597,7 @@ static void detect_super_cols while (rp < rp_end) { col = *rp++ ; - if (COL_IS_DEAD (col)) + if (Col[col].is_dead()) { continue ; } @@ -1590,7 +1609,7 @@ static void detect_super_cols /* === Get the first column in this hash bucket ===================== */ head_column = head [hash] ; - if (head_column > COLAMD_EMPTY) + if (head_column > Empty) { first_col = Col [head_column].shared3.headhash ; } @@ -1601,10 +1620,10 @@ static void detect_super_cols /* === Consider each column in the hash bucket ====================== */ - for (super_c = first_col ; super_c != COLAMD_EMPTY ; + for (super_c = first_col ; super_c != Empty ; super_c = Col [super_c].shared4.hash_next) { - COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ; + COLAMD_ASSERT (Col [super_c].is_alive()) ; COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ; length = Col [super_c].length ; @@ -1614,10 +1633,10 @@ static void detect_super_cols /* === Compare super_c with all columns after it ================ */ for (c = Col [super_c].shared4.hash_next ; - c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next) + c != Empty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; - COLAMD_ASSERT (COL_IS_ALIVE (c)) ; + COLAMD_ASSERT (Col[c].is_alive()) ; 
COLAMD_ASSERT (Col [c].shared3.hash == hash) ; /* not identical if lengths or scores are different */ @@ -1635,10 +1654,10 @@ static void detect_super_cols for (i = 0 ; i < length ; i++) { /* the columns are "clean" (no dead rows) */ - COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ; - COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ; + COLAMD_ASSERT ( cp1->is_alive() ); + COLAMD_ASSERT ( cp2->is_alive() ); /* row indices will same order for both supercols, */ - /* no gather scatter nessasary */ + /* no gather scatter necessary */ if (*cp1++ != *cp2++) { break ; @@ -1658,9 +1677,9 @@ static void detect_super_cols Col [super_c].shared1.thickness += Col [c].shared1.thickness ; Col [c].shared1.parent = super_c ; - KILL_NON_PRINCIPAL_COL (c) ; + Col[c].kill_non_principal() ; /* order c later, in order_children() */ - Col [c].shared2.order = COLAMD_EMPTY ; + Col [c].shared2.order = Empty ; /* remove c from hash bucket */ Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; } @@ -1668,15 +1687,15 @@ static void detect_super_cols /* === Empty this hash bucket ======================================= */ - if (head_column > COLAMD_EMPTY) + if (head_column > Empty) { /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = COLAMD_EMPTY ; + Col [head_column].shared3.headhash = Empty ; } else { /* corresponding degree list "hash" is empty */ - head [hash] = COLAMD_EMPTY ; + head [hash] = Empty ; } } } @@ -1688,7 +1707,7 @@ static void detect_super_cols /* Defragments and compacts columns and rows in the workspace A. Used when - all avaliable memory has been used while performing row merging. Returns + all available memory has been used while performing row merging. Returns the index of the first free position in A, after garbage collection. The time taken by this routine is linear is the size of the array A, which is itself linear in the number of nonzeros in the input matrix. 
@@ -1698,11 +1717,11 @@ template static IndexType garbage_collection /* returns the new value of pfree */ ( /* === Parameters ======================================================= */ - + IndexType n_row, /* number of rows */ IndexType n_col, /* number of columns */ - Colamd_Row Row [], /* row info */ - colamd_col Col [], /* column info */ + RowStructure Row [], /* row info */ + ColStructure Col [], /* column info */ IndexType A [], /* A [0 ... Alen-1] holds the matrix */ IndexType *pfree /* &A [0] ... pfree is in use */ ) @@ -1721,7 +1740,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ pdest = &A[0] ; for (c = 0 ; c < n_col ; c++) { - if (COL_IS_ALIVE (c)) + if (Col[c].is_alive()) { psrc = &A [Col [c].start] ; @@ -1732,7 +1751,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { r = *psrc++ ; - if (ROW_IS_ALIVE (r)) + if (Row[r].is_alive()) { *pdest++ = r ; } @@ -1745,22 +1764,22 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (r = 0 ; r < n_row ; r++) { - if (ROW_IS_ALIVE (r)) + if (Row[r].is_alive()) { if (Row [r].length == 0) { - /* this row is of zero length. cannot compact it, so kill it */ - COLAMD_DEBUG3 (("Defrag row kill\n")) ; - KILL_ROW (r) ; + /* this row is of zero length. 
cannot compact it, so kill it */ + COLAMD_DEBUG3 (("Defrag row kill\n")) ; + Row[r].kill() ; } else { - /* save first column index in Row [r].shared2.first_column */ - psrc = &A [Row [r].start] ; - Row [r].shared2.first_column = *psrc ; - COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; - /* flag the start of the row with the one's complement of row */ - *psrc = ONES_COMPLEMENT (r) ; + /* save first column index in Row [r].shared2.first_column */ + psrc = &A [Row [r].start] ; + Row [r].shared2.first_column = *psrc ; + COLAMD_ASSERT (Row[r].is_alive()) ; + /* flag the start of the row with the one's complement of row */ + *psrc = ones_complement(r) ; } } @@ -1776,11 +1795,11 @@ static IndexType garbage_collection /* returns the new value of pfree */ { psrc-- ; /* get the row index */ - r = ONES_COMPLEMENT (*psrc) ; + r = ones_complement(*psrc) ; COLAMD_ASSERT (r >= 0 && r < n_row) ; /* restore first column index */ *psrc = Row [r].shared2.first_column ; - COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; + COLAMD_ASSERT (Row[r].is_alive()) ; /* move and compact the row */ COLAMD_ASSERT (pdest <= psrc) ; @@ -1789,7 +1808,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { c = *psrc++ ; - if (COL_IS_ALIVE (c)) + if (Col[c].is_alive()) { *pdest++ = c ; } @@ -1821,7 +1840,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ /* === Parameters ======================================================= */ IndexType n_row, /* number of rows in A */ - Colamd_Row Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ + RowStructure Row [] /* Row [0 ... 
n_row-1].shared2.mark is set to zero */ ) { /* === Local variables ================================================== */ @@ -1830,7 +1849,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ for (r = 0 ; r < n_row ; r++) { - if (ROW_IS_ALIVE (r)) + if (Row[r].is_alive()) { Row [r].shared2.mark = 0 ; } @@ -1838,6 +1857,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ return (1) ; } +} // namespace Colamd -} // namespace internal +} // namespace internal #endif diff --git a/inst/include/Eigen/src/OrderingMethods/Ordering.h b/inst/include/Eigen/src/OrderingMethods/Ordering.h index 7ea9b14d..c5789701 100644 --- a/inst/include/Eigen/src/OrderingMethods/Ordering.h +++ b/inst/include/Eigen/src/OrderingMethods/Ordering.h @@ -31,15 +31,13 @@ void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) for (int i = 0; i < C.rows(); i++) { for (typename MatrixType::InnerIterator it(C, i); it; ++it) - it.valueRef() = 0.0; + it.valueRef() = typename MatrixType::Scalar(0); } symmat = C + A; } } -#ifndef EIGEN_MPL2_ONLY - /** \ingroup OrderingMethods_Module * \class AMDOrdering * @@ -81,8 +79,6 @@ class AMDOrdering } }; -#endif // EIGEN_MPL2_ONLY - /** \ingroup OrderingMethods_Module * \class NaturalOrdering * @@ -133,17 +129,17 @@ class COLAMDOrdering StorageIndex n = StorageIndex(mat.cols()); StorageIndex nnz = StorageIndex(mat.nonZeros()); // Get the recommended value of Alen to be used by colamd - StorageIndex Alen = internal::colamd_recommended(nnz, m, n); + StorageIndex Alen = internal::Colamd::recommended(nnz, m, n); // Set the default parameters - double knobs [COLAMD_KNOBS]; - StorageIndex stats [COLAMD_STATS]; - internal::colamd_set_defaults(knobs); + double knobs [internal::Colamd::NKnobs]; + StorageIndex stats [internal::Colamd::NStats]; + internal::Colamd::set_defaults(knobs); IndexVector p(n+1), A(Alen); for(StorageIndex i=0; i <= n; i++) p(i) = mat.outerIndexPtr()[i]; for(StorageIndex i=0; i < 
nnz; i++) A(i) = mat.innerIndexPtr()[i]; // Call Colamd routine to compute the ordering - StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); + StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats); EIGEN_UNUSED_VARIABLE(info); eigen_assert( info && "COLAMD failed " ); diff --git a/inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h b/inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h index 160d8a52..37426877 100644 --- a/inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the PaStiX reports a problem * \c InvalidInput if the input matrix is invalid * diff --git a/inst/include/Eigen/src/PardisoSupport/PardisoSupport.h b/inst/include/Eigen/src/PardisoSupport/PardisoSupport.h index 98d0e3f2..f89b79bd 100644 --- a/inst/include/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/inst/include/Eigen/src/PardisoSupport/PardisoSupport.h @@ -123,6 +123,7 @@ class PardisoImpl : public SparseSolverBase }; PardisoImpl() + : m_analysisIsOk(false), m_factorizationIsOk(false) { eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type"); m_iparm.setZero(); @@ -140,7 +141,7 @@ class PardisoImpl : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix appears to be negative. 
*/ ComputationInfo info() const @@ -192,8 +193,7 @@ class PardisoImpl : public SparseSolverBase void pardisoInit(int type) { m_type = type; - EIGEN_USING_STD_MATH(abs); - bool symmetric = abs(m_type) < 10; + bool symmetric = std::abs(m_type) < 10; m_iparm[0] = 1; // No solver default m_iparm[1] = 2; // use Metis for the ordering m_iparm[2] = 0; // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??) @@ -386,14 +386,15 @@ class PardisoLU : public PardisoImpl< PardisoLU > { protected: typedef PardisoImpl Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLU >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; + using Base::compute; using Base::solve; @@ -441,14 +442,14 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > { protected: typedef PardisoImpl< PardisoLLT > Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLLT >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; enum { UpLo = _UpLo }; using Base::compute; @@ -504,14 +505,14 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT > { protected: typedef PardisoImpl< PardisoLDLT > Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLDLT >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; using Base::compute; enum { UpLo = Options&(Upper|Lower) }; diff --git a/inst/include/Eigen/src/QR/ColPivHouseholderQR.h b/inst/include/Eigen/src/QR/ColPivHouseholderQR.h index a7b47d55..9b677e9b 100644 
--- a/inst/include/Eigen/src/QR/ColPivHouseholderQR.h +++ b/inst/include/Eigen/src/QR/ColPivHouseholderQR.h @@ -17,6 +17,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -46,20 +49,19 @@ template struct traits > * \sa MatrixBase::colPivHouseholderQr() */ template class ColPivHouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; typedef typename internal::plain_row_type::type IntRowVectorType; @@ -156,6 +158,7 @@ template class ColPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -172,11 +175,8 @@ template class ColPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif HouseholderSequenceType householderQ() const; HouseholderSequenceType matrixQ() const @@ -402,7 +402,7 @@ template class ColPivHouseholderQR */ RealScalar maxPivot() const { return m_maxpivot; } - /** \brief Reports whether the QR factorization was succesful. 
+ /** \brief Reports whether the QR factorization was successful. * * \note This function always returns \c Success. It is provided for compatibility * with other factorization routines. @@ -416,8 +416,10 @@ template class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -584,8 +586,6 @@ template template void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); - const Index nonzero_pivots = nonzeroPivots(); if(nonzero_pivots == 0) @@ -596,11 +596,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs) - .setLength(nonzero_pivots) - .transpose() - ); + c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() ); m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) .template triangularView() @@ -609,6 +605,31 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i); for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero(); } + +template +template +void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index nonzero_pivots = nonzeroPivots(); + + if(nonzero_pivots == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs); + + m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(nonzero_pivots)); + + 
dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots); + dst.bottomRows(rows()-nonzero_pivots).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf() ); +} #endif namespace internal { diff --git a/inst/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/inst/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 34c637b7..486d3373 100644 --- a/inst/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/inst/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -16,6 +16,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -44,19 +47,21 @@ struct traits > * * \sa MatrixBase::completeOrthogonalDecomposition() */ -template -class CompleteOrthogonalDecomposition { +template class CompleteOrthogonalDecomposition + : public SolverBase > +{ public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + + template + friend struct internal::solve_assertion; + + EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; @@ -131,9 +136,9 @@ class CompleteOrthogonalDecomposition { m_temp(matrix.cols()) { computeInPlace(); - } - + } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method computes the minimum-norm solution X to a least squares * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of * which \c *this is the complete orthogonal decomposition. 
@@ -145,11 +150,8 @@ class CompleteOrthogonalDecomposition { */ template inline const Solve solve( - const MatrixBase& b) const { - eigen_assert(m_cpqr.m_isInitialized && - "CompleteOrthogonalDecomposition is not initialized."); - return Solve(*this, b.derived()); - } + const MatrixBase& b) const; + #endif HouseholderSequenceType householderQ(void) const; HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); } @@ -158,8 +160,8 @@ class CompleteOrthogonalDecomposition { */ MatrixType matrixZ() const { MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols()); - applyZAdjointOnTheLeftInPlace(Z); - return Z.adjoint(); + applyZOnTheLeftInPlace(Z); + return Z; } /** \returns a reference to the matrix where the complete orthogonal @@ -275,6 +277,7 @@ class CompleteOrthogonalDecomposition { */ inline const Inverse pseudoInverse() const { + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); return Inverse(*this); } @@ -353,7 +356,7 @@ class CompleteOrthogonalDecomposition { inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); } /** \brief Reports whether the complete orthogonal decomposition was - * succesful. + * successful. * * \note This function always returns \c Success. 
It is provided for * compatibility @@ -367,7 +370,10 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; + void _solve_impl(const RhsType& rhs, DstType& dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -375,8 +381,22 @@ class CompleteOrthogonalDecomposition { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b"); + } + void computeInPlace(); + /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or + * \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate + * is set to \c true. + */ + template + void applyZOnTheLeftInPlace(Rhs& rhs) const; + /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. 
*/ template @@ -452,7 +472,7 @@ void CompleteOrthogonalDecomposition::computeInPlace() // Apply Z(k) to the first k rows of X_k m_cpqr.m_qr.topRightCorner(k, cols - rank + 1) .applyHouseholderOnTheRight( - m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k), + m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k), &m_temp(0)); } if (k != rank - 1) { @@ -464,6 +484,28 @@ void CompleteOrthogonalDecomposition::computeInPlace() } } +template +template +void CompleteOrthogonalDecomposition::applyZOnTheLeftInPlace( + Rhs& rhs) const { + const Index cols = this->cols(); + const Index nrhs = rhs.cols(); + const Index rank = this->rank(); + Matrix temp((std::max)(cols, nrhs)); + for (Index k = rank-1; k >= 0; --k) { + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + rhs.middleRows(rank - 1, cols - rank + 1) + .applyHouseholderOnTheLeft( + matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf(), zCoeffs().template conjugateIf()(k), + &temp(0)); + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + } +} + template template void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( @@ -471,7 +513,7 @@ void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( const Index cols = this->cols(); const Index nrhs = rhs.cols(); const Index rank = this->rank(); - Matrix temp((std::max)(cols, nrhs)); + Matrix temp((std::max)(cols, nrhs)); for (Index k = 0; k < rank; ++k) { if (k != rank - 1) { rhs.row(k).swap(rhs.row(rank - 1)); @@ -491,8 +533,6 @@ template template void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( const RhsType& rhs, DstType& dst) const { - eigen_assert(rhs.rows() == this->rows()); - const Index rank = this->rank(); if (rank == 0) { dst.setZero(); @@ -500,11 +540,8 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( } // Compute c = Q^* * rhs - // Note that the matrix Q = H_0^* H_1^*... 
so its inverse is - // Q^* = (H_0 H_1 ...)^T typename RhsType::PlainObject c(rhs); - c.applyOnTheLeft( - householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose()); + c.applyOnTheLeft(matrixQ().setLength(rank).adjoint()); // Solve T z = c(1:rank, :) dst.topRows(rank) = matrixT() @@ -523,10 +560,45 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( // Undo permutation to get x = P^{-1} * y. dst = colsPermutation() * dst; } + +template +template +void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = this->rank(); + + if (rank == 0) { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(colsPermutation().transpose()*rhs); + + if (rank < cols()) { + applyZOnTheLeftInPlace(c); + } + + matrixT().topLeftCorner(rank, rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); +} #endif namespace internal { +template +struct traits > > + : traits::PlainObject> +{ + enum { Flags = 0 }; +}; + template struct Assignment >, internal::assign_op::Scalar>, Dense2Dense> { @@ -534,7 +606,8 @@ struct Assignment SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { - dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows())); + typedef Matrix IdentityMatrixType; + dst = src.nestedExpression().solve(IdentityMatrixType::Identity(src.cols(), src.cols())); } }; diff --git a/inst/include/Eigen/src/QR/FullPivHouseholderQR.h b/inst/include/Eigen/src/QR/FullPivHouseholderQR.h index e489bddc..d0664a1d 100644 --- a/inst/include/Eigen/src/QR/FullPivHouseholderQR.h +++ b/inst/include/Eigen/src/QR/FullPivHouseholderQR.h @@ -18,6 +18,9 @@ namespace internal { template struct traits > : 
traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -55,20 +58,19 @@ struct traits > * \sa MatrixBase::fullPivHouseholderQr() */ template class FullPivHouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef internal::FullPivHouseholderQRMatrixQReturnType MatrixQReturnType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef Matrix class FullPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * \c *this is the QR decomposition. * @@ -173,11 +176,8 @@ template class FullPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns Expression object representing the matrix Q */ @@ -392,22 +392,24 @@ template class FullPivHouseholderQR * diagonal coefficient of U. 
*/ RealScalar maxPivot() const { return m_maxpivot; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; @@ -499,15 +501,15 @@ void FullPivHouseholderQR::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; i++) { - m_rows_transpositions.coeffRef(i) = i; - m_cols_transpositions.coeffRef(i) = i; + m_rows_transpositions.coeffRef(i) = internal::convert_index(i); + m_cols_transpositions.coeffRef(i) = internal::convert_index(i); m_hCoeffs.coeffRef(i) = Scalar(0); } break; } - m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner; - m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner; + m_rows_transpositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); + m_cols_transpositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k)); ++number_of_transpositions; @@ -541,7 +543,6 @@ template template void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); const Index l_rank = rank(); // FIXME introduce nonzeroPivots() and use it here. 
and more generally, @@ -554,7 +555,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType typename RhsType::PlainObject c(rhs); - Matrix temp(rhs.cols()); + Matrix temp(rhs.cols()); for (Index k = 0; k < l_rank; ++k) { Index remainingSize = rows()-k; @@ -571,6 +572,42 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i); for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero(); } + +template +template +void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index l_rank = rank(); + + if(l_rank == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs); + + m_qr.topLeftCorner(l_rank, l_rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(l_rank)); + + dst.topRows(l_rank) = c.topRows(l_rank); + dst.bottomRows(rows()-l_rank).setZero(); + + Matrix temp(dst.cols()); + const Index size = (std::min)(rows(), cols()); + for (Index k = size-1; k >= 0; --k) + { + Index remainingSize = rows()-k; + + dst.bottomRightCorner(remainingSize, dst.cols()) + .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf(), + m_hCoeffs.template conjugateIf().coeff(k), &temp.coeffRef(0)); + + dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k))); + } +} #endif namespace internal { diff --git a/inst/include/Eigen/src/QR/HouseholderQR.h b/inst/include/Eigen/src/QR/HouseholderQR.h index 3513d995..801739fb 100644 --- a/inst/include/Eigen/src/QR/HouseholderQR.h +++ b/inst/include/Eigen/src/QR/HouseholderQR.h @@ -14,6 +14,18 @@ namespace Eigen { +namespace internal { +template struct traits > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 
}; +}; + +} // end namespace internal + /** \ingroup QR_Module * * @@ -42,20 +54,19 @@ namespace Eigen { * \sa MatrixBase::householderQr() */ template class HouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix MatrixQType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef typename internal::plain_row_type::type RowVectorType; @@ -121,6 +132,7 @@ template class HouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -137,11 +149,8 @@ template class HouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations. * @@ -204,28 +213,30 @@ template class HouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. * * For advanced uses only. 
*/ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; @@ -292,7 +303,7 @@ template struct householder_qr_inplace_blocked { - // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h + // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32, typename MatrixQR::Scalar* tempData = 0) { @@ -350,15 +361,10 @@ template void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index rank = (std::min)(rows(), cols()); - eigen_assert(rhs.rows() == rows()); typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... 
so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence( - m_qr.leftCols(rank), - m_hCoeffs.head(rank)).transpose() - ); + c.applyOnTheLeft(householderQ().setLength(rank).adjoint() ); m_qr.topLeftCorner(rank, rank) .template triangularView() @@ -367,6 +373,25 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c dst.topRows(rank) = c.topRows(rank); dst.bottomRows(cols()-rank).setZero(); } + +template +template +void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = (std::min)(rows(), cols()); + + typename RhsType::PlainObject c(rhs); + + m_qr.topLeftCorner(rank, rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); +} #endif /** Performs the QR factorization of the given matrix \a matrix. 
The result of diff --git a/inst/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/inst/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 953d57c9..013c7ae7 100644 --- a/inst/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/inst/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -74,13 +74,35 @@ class SPQR : public SparseSolverBase > }; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) + : m_analysisIsOk(false), + m_factorizationIsOk(false), + m_isRUpToDate(false), + m_ordering(SPQR_ORDERING_DEFAULT), + m_allow_tol(SPQR_DEFAULT_TOL), + m_tolerance (NumTraits::epsilon()), + m_cR(0), + m_E(0), + m_H(0), + m_HPinv(0), + m_HTau(0), + m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) + : m_analysisIsOk(false), + m_factorizationIsOk(false), + m_isRUpToDate(false), + m_ordering(SPQR_ORDERING_DEFAULT), + m_allow_tol(SPQR_DEFAULT_TOL), + m_tolerance (NumTraits::epsilon()), + m_cR(0), + m_E(0), + m_H(0), + m_HPinv(0), + m_HTau(0), + m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -220,7 +242,7 @@ class SPQR : public SparseSolverBase > /** \brief Reports whether previous computation was successful. 
* - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the sparse QR can not be computed */ ComputationInfo info() const diff --git a/inst/include/Eigen/src/SVD/BDCSVD.h b/inst/include/Eigen/src/SVD/BDCSVD.h index a5b73f8f..17f8e443 100644 --- a/inst/include/Eigen/src/SVD/BDCSVD.h +++ b/inst/include/Eigen/src/SVD/BDCSVD.h @@ -22,6 +22,11 @@ // #define EIGEN_BDCSVD_DEBUG_VERBOSE // #define EIGEN_BDCSVD_SANITY_CHECKS +#ifdef EIGEN_BDCSVD_SANITY_CHECKS +#undef eigen_internal_assert +#define eigen_internal_assert(X) assert(X); +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -34,6 +39,7 @@ namespace internal { template struct traits > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -57,7 +63,7 @@ struct traits > * recommended and can several order of magnitude faster. * * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations. - * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless + * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will * significantly degrade the accuracy. * @@ -105,7 +111,7 @@ class BDCSVD : public SVDBase > * The default constructor is useful in cases in which the user intends to * perform decompositions via BDCSVD::compute(const MatrixType&). 
*/ - BDCSVD() : m_algoswap(16), m_numIters(0) + BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0) {} @@ -202,6 +208,7 @@ class BDCSVD : public SVDBase > using Base::m_computeThinV; using Base::m_matrixU; using Base::m_matrixV; + using Base::m_info; using Base::m_isInitialized; using Base::m_nonzeroSingularValues; @@ -212,7 +219,7 @@ class BDCSVD : public SVDBase > // Method to allocate and initialize matrix and attributes template -void BDCSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void BDCSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { m_isTranspose = (cols > rows); @@ -250,16 +257,25 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign { // FIXME this line involves temporaries JacobiSVD jsvd(matrix,computationOptions); - if(computeU()) m_matrixU = jsvd.matrixU(); - if(computeV()) m_matrixV = jsvd.matrixV(); - m_singularValues = jsvd.singularValues(); - m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); m_isInitialized = true; + m_info = jsvd.info(); + if (m_info == Success || m_info == NoConvergence) { + if(computeU()) m_matrixU = jsvd.matrixU(); + if(computeV()) m_matrixV = jsvd.matrixV(); + m_singularValues = jsvd.singularValues(); + m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); + } return *this; } //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows - RealScalar scale = matrix.cwiseAbs().maxCoeff(); + RealScalar scale = matrix.cwiseAbs().template maxCoeff(); + if (!(numext::isfinite)(scale)) { + m_isInitialized = true; + m_info = InvalidInput; + return *this; + } + if(scale==Literal(0)) scale = Literal(1); MatrixX copy; if (m_isTranspose) copy = matrix.adjoint()/scale; @@ -276,7 +292,11 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose(); m_computed.template bottomRows<1>().setZero(); divide(0, m_diagSize - 1, 0, 
0, 0); - + if (m_info != Success && m_info != NoConvergence) { + m_isInitialized = true; + return *this; + } + //**** step 3 - Copy singular values and vectors for (int i=0; i::structured_update(Block A, co //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template -void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { // requires rows = cols + 1; using std::pow; @@ -408,6 +428,8 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, { // FIXME this line involves temporaries JacobiSVD b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0)); + m_info = b.info(); + if (m_info != Success && m_info != NoConvergence) return; if (m_compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU(); else @@ -427,7 +449,9 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the // right submatrix before the left one. 
divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift); + if (m_info != Success && m_info != NoConvergence) return; divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1); + if (m_info != Success && m_info != NoConvergence) return; if (m_compU) { @@ -568,7 +592,7 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, // handling of round-off errors, be consistent in ordering // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf template -void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) +void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits::min)(); using std::abs; @@ -591,7 +615,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // but others are interleaved and we must ignore them at this stage. 
// To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==Literal(0)) --actual_n; + while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); } Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) @@ -618,13 +642,11 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec std::cout << " shift: " << shifts.transpose() << "\n"; { - Index actual_n = n; - while(actual_n>1 && abs(col0(actual_n-1))= 0).all()); std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n"; - std::cout << " check3 (>0) : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n"; - std::cout << " check4 (>0) : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n"; + assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all()); } #endif @@ -652,13 +674,13 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec #endif #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(U.allFinite()); - assert(V.allFinite()); - assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n); - assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n); assert(m_naiveU.allFinite()); assert(m_naiveV.allFinite()); assert(m_computed.allFinite()); + assert(U.allFinite()); + assert(V.allFinite()); +// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits::epsilon() * n); +// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits::epsilon() * n); #endif // Because of deflation, the singular values might not be completely sorted. 
@@ -673,6 +695,15 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec if(m_compV) V.col(i).swap(V.col(i+1)); } } + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + { + bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all(); + if(!singular_values_sorted) + std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n"; + assert(singular_values_sorted); + } +#endif // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end @@ -749,19 +780,22 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar mid = left + (right-left) / Literal(2); RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0)); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << right-left << "\n"; - std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right) << "\n"; - std::cout << " = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.2*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.3*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.4*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.49*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.5*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.51*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.6*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.7*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.8*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n"; + std::cout << "right-left = " << right-left << "\n"; +// std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left) +// << " " << 
secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right) << "\n"; + std::cout << " = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.1) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.2) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.3) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.4) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.49) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.5) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.51) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.6) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.7) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.8) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.9) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n"; #endif RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? 
left : right; @@ -819,13 +853,16 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // And find mu such that f(mu)==0: RealScalar muZero = -a/b; RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift); + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((numext::isfinite)(fZero)); +#endif muPrev = muCur; fPrev = fCur; muCur = muZero; fCur = fZero; - if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; @@ -858,20 +895,33 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d else rightShifted = -(std::numeric_limits::min)(); } - + RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); eigen_internal_assert(fLeft " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; + std::cout << "f(leftShifted) using leftShifted=" << leftShifted << " ; diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; " + << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n"; + std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " + << "[" << left << " .. 
" << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift + << " , f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift) + << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n"; } #endif eigen_internal_assert(fLeft * fRight < Literal(0)); @@ -912,6 +962,15 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d shifts[k] = shift; mus[k] = muCur; +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE + if(k+1=singVals[k-1]); + assert(singVals[k]>=diag(k)); +#endif + // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - @@ -935,7 +994,7 @@ void BDCSVD::perturbCol0 zhat.setZero(); return; } - Index last = perm(m-1); + Index lastIdx = perm(m-1); // The offset permits to skip deflated entries while computing zhat for (Index k = 0; k < n; ++k) { @@ -945,15 +1004,43 @@ void BDCSVD::perturbCol0 { // see equation (3.6) RealScalar dk = diag(k); - RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk)); + RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk)); +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + if(prod<0) { + std::cout << "k = " << k << " ; z(k)=" << col0(k) << ", diag(k)=" << dk << "\n"; + std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n"; + std::cout << " = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n"; + } + assert(prod>=0); +#endif for(Index l = 0; l=k && (l==0 || l-1>=m)) + { + std::cout << "Error in perturbCol0\n"; + std::cout << " " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " " << "\n"; + std::cout << " " <=0); +#endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(i!=k && 
numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk)) @@ -962,9 +1049,12 @@ void BDCSVD::perturbCol0 } } #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; + std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((numext::isfinite)(tmp)); +#endif zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp); } } @@ -1018,7 +1108,7 @@ void BDCSVD::computeSingVecs // i >= 1, di almost null and zi non null. // We use a rotation to zero out zi applied to the left of M template -void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index size) +void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1047,7 +1137,7 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index // We apply two rotations to have zj = 0; // TODO deflation44 is still broken and not properly tested template -void BDCSVD::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size) +void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1074,7 +1164,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi } c/=r; s/=r; - m_computed(firstColm + i, firstColm) = r; + m_computed(firstColm + i, firstColm) = r; m_computed(firstColm + j, firstColm + j) = 
m_computed(firstColm + i, firstColm + i); m_computed(firstColm + j, firstColm) = Literal(0); @@ -1087,7 +1177,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive] template -void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { using std::sqrt; using std::abs; @@ -1148,6 +1238,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "to be sorted: " << diag.transpose() << "\n\n"; + std::cout << " : " << col0.transpose() << "\n\n"; #endif { // Check for total deflation @@ -1238,7 +1329,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index if( (diag(i) - diag(i-1)) < NumTraits::epsilon()*maxDiag ) { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*diag(i) << "\n"; + std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*/*diag(i)*/maxDiag << "\n"; #endif eigen_internal_assert(abs(diag(i) - diag(i-1))::deflation(Index firstCol, Index lastCol, Index k, Index #endif }//end deflation -#ifndef __CUDACC__ /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm @@ -1270,7 +1360,6 @@ MatrixBase::bdcSvd(unsigned int computationOptions) const { return BDCSVD(*this, computationOptions); } -#endif } // end namespace Eigen diff --git a/inst/include/Eigen/src/SVD/JacobiSVD.h b/inst/include/Eigen/src/SVD/JacobiSVD.h index 43488b1e..9d95acdf 100644 --- a/inst/include/Eigen/src/SVD/JacobiSVD.h +++ 
b/inst/include/Eigen/src/SVD/JacobiSVD.h @@ -112,12 +112,12 @@ class qr_preconditioner_impl - TransposeTypeWithSameStorageOrder; + + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -202,13 +202,12 @@ class qr_preconditioner_impl - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -303,8 +302,9 @@ class qr_preconditioner_impl - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -425,6 +425,7 @@ struct svd_precondition_2x2_block_to_be_real template struct traits > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -584,6 +585,7 @@ template class JacobiSVD using Base::m_matrixU; using Base::m_matrixV; using Base::m_singularValues; + using Base::m_info; using Base::m_isInitialized; using Base::m_isAllocated; using Base::m_usePrescribedThreshold; @@ -610,7 +612,7 @@ template class JacobiSVD }; template -void JacobiSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void JacobiSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { eigen_assert(rows >= 0 && cols >= 0); @@ -624,6 +626,7 @@ void JacobiSVD::allocate(Index rows, Index cols, u m_rows = rows; m_cols = cols; + m_info = Success; m_isInitialized = false; m_isAllocated = true; m_computationOptions = computationOptions; @@ -673,7 +676,12 @@ JacobiSVD::compute(const MatrixType& matrix, unsig const RealScalar considerAsZero = 
(std::numeric_limits::min)(); // Scaling factor to reduce over/under-flows - RealScalar scale = matrix.cwiseAbs().maxCoeff(); + RealScalar scale = matrix.cwiseAbs().template maxCoeff(); + if (!(numext::isfinite)(scale)) { + m_isInitialized = true; + m_info = InvalidInput; + return *this; + } if(scale==RealScalar(0)) scale = RealScalar(1); /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ diff --git a/inst/include/Eigen/src/SVD/SVDBase.h b/inst/include/Eigen/src/SVD/SVDBase.h index 53da2848..bc7ab88b 100644 --- a/inst/include/Eigen/src/SVD/SVDBase.h +++ b/inst/include/Eigen/src/SVD/SVDBase.h @@ -17,6 +17,18 @@ #define EIGEN_SVDBASE_H namespace Eigen { + +namespace internal { +template struct traits > + : traits +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; +} + /** \ingroup SVD_Module * * @@ -39,20 +51,26 @@ namespace Eigen { * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix, * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving. + * + * The status of the computation can be retrived using the \a info() method. Unless \a info() returns \a Success, the results should be not + * considered well defined. * - * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to + * If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will return \a InvalidInput, but the computation is guaranteed to * terminate in finite (and reasonable) time. 
* \sa class BDCSVD, class JacobiSVD */ -template -class SVDBase +template class SVDBase + : public SolverBase > { +public: + + template + friend struct internal::solve_assertion; -public: typedef typename internal::traits::MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; + typedef typename Eigen::internal::traits::StorageIndex StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -82,7 +100,7 @@ class SVDBase */ const MatrixUType& matrixU() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?"); return m_matrixU; } @@ -98,7 +116,7 @@ class SVDBase */ const MatrixVType& matrixV() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); eigen_assert(computeV() && "This SVD decomposition didn't compute V. 
Did you ask for it?"); return m_matrixV; } @@ -110,14 +128,14 @@ class SVDBase */ const SingularValuesType& singularValues() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); return m_singularValues; } /** \returns the number of singular values that are not exactly 0 */ Index nonzeroSingularValues() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); return m_nonzeroSingularValues; } @@ -130,7 +148,7 @@ class SVDBase inline Index rank() const { using std::abs; - eigen_assert(m_isInitialized && "JacobiSVD is not initialized."); + _check_compute_assertions(); if(m_singularValues.size()==0) return 0; RealScalar premultiplied_threshold = numext::maxi(m_singularValues.coeff(0) * threshold(), (std::numeric_limits::min)()); Index i = m_nonzeroSingularValues-1; @@ -194,6 +212,7 @@ class SVDBase inline Index rows() const { return m_rows; } inline Index cols() const { return m_cols; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A. * * \param b the right-hand-side of the equation to solve. @@ -205,32 +224,55 @@ class SVDBase */ template inline const Solve - solve(const MatrixBase& b) const + solve(const MatrixBase& b) const; + #endif + + + /** \brief Reports whether previous computation was successful. + * + * \returns \c Success if computation was successful. 
+ */ + EIGEN_DEVICE_FUNC + ComputationInfo info() const { eigen_assert(m_isInitialized && "SVD is not initialized."); - eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice)."); - return Solve(derived(), b.derived()); + return m_info; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + + void _check_compute_assertions() const { + eigen_assert(m_isInitialized && "SVD is not initialized."); + } + + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + _check_compute_assertions(); + eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice)."); + eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b"); + } + // return true if already allocated bool allocate(Index rows, Index cols, unsigned int computationOptions) ; MatrixUType m_matrixU; MatrixVType m_matrixV; SingularValuesType m_singularValues; + ComputationInfo m_info; bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold; bool m_computeFullU, m_computeThinU; bool m_computeFullV, m_computeThinV; @@ -243,9 +285,14 @@ class SVDBase * Default constructor of SVDBase */ SVDBase() - : m_isInitialized(false), + : m_info(Success), + m_isInitialized(false), m_isAllocated(false), m_usePrescribedThreshold(false), + m_computeFullU(false), + m_computeThinU(false), + m_computeFullV(false), + m_computeThinV(false), m_computationOptions(0), m_rows(-1), m_cols(-1), m_diagSize(0) { @@ -260,17 +307,30 @@ template template void SVDBase::_solve_impl(const RhsType &rhs, DstType &dst) const { - 
eigen_assert(rhs.rows() == rows()); - // A = U S V^* // So A^{-1} = V S^{-1} U^* - Matrix tmp; + Matrix tmp; Index l_rank = rank(); tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs; tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; dst = m_matrixV.leftCols(l_rank) * tmp; } + +template +template +void SVDBase::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + // A = U S V^* + // So A^{-*} = U S^{-1} V^* + // And A^{-T} = U_conj S^{-1} V^T + Matrix tmp; + Index l_rank = rank(); + + tmp.noalias() = m_matrixV.leftCols(l_rank).transpose().template conjugateIf() * rhs; + tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; + dst = m_matrixU.template conjugateIf().leftCols(l_rank) * tmp; +} #endif template @@ -288,6 +348,7 @@ bool SVDBase::allocate(Index rows, Index cols, unsigned int computat m_rows = rows; m_cols = cols; + m_info = Success; m_isInitialized = false; m_isAllocated = true; m_computationOptions = computationOptions; diff --git a/inst/include/Eigen/src/SVD/UpperBidiagonalization.h b/inst/include/Eigen/src/SVD/UpperBidiagonalization.h index 11ac847e..997defc4 100644 --- a/inst/include/Eigen/src/SVD/UpperBidiagonalization.h +++ b/inst/include/Eigen/src/SVD/UpperBidiagonalization.h @@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat, .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]); // apply householder transform to remaining part of mat on the left mat.bottomRightCorner(remainingRows-1, remainingCols) - .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData); + .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData); } } @@ -202,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType y_k( Y.col(k).tail(remainingCols) ); - // let's use the begining of column k of Y as a temporary vector + // let's use the beginning of column k of Y as a 
temporary vector SubColumnType tmp( Y.col(k).head(k) ); y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck tmp.noalias() = V_k1.adjoint() * v_k; @@ -231,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType x_k ( X.col(k).tail(remainingRows-1) ); - // let's use the begining of column k of X as a temporary vectors + // let's use the beginning of column k of X as a temporary vectors // note that tmp0 and tmp1 overlaps SubColumnType tmp0 ( X.col(k).head(k) ), tmp1 ( X.col(k).head(k+1) ); diff --git a/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky.h b/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky.h index 369e6804..9f93e325 100644 --- a/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -80,11 +80,19 @@ class SimplicialCholeskyBase : public SparseSolverBase /** Default constructor */ SimplicialCholeskyBase() - : m_info(Success), m_shiftOffset(0), m_shiftScale(1) + : m_info(Success), + m_factorizationIsOk(false), + m_analysisIsOk(false), + m_shiftOffset(0), + m_shiftScale(1) {} explicit SimplicialCholeskyBase(const MatrixType& matrix) - : m_info(Success), m_shiftOffset(0), m_shiftScale(1) + : m_info(Success), + m_factorizationIsOk(false), + m_analysisIsOk(false), + m_shiftOffset(0), + m_shiftScale(1) { derived().compute(matrix); } @@ -101,7 +109,7 @@ class SimplicialCholeskyBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. 
*/ ComputationInfo info() const @@ -210,7 +218,7 @@ class SimplicialCholeskyBase : public SparseSolverBase CholMatrixType tmp(size,size); ConstCholMatrixPtr pmat; - if(m_P.size()==0 && (UpLo&Upper)==Upper) + if(m_P.size() == 0 && (int(UpLo) & int(Upper)) == Upper) { // If there is no ordering, try to directly use the input matrix without any copy internal::simplicial_cholesky_grab_input::run(a, pmat, tmp); @@ -279,8 +287,8 @@ template struct traits CholMatrixType; typedef TriangularView MatrixL; typedef TriangularView MatrixU; - static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); } - static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); } + static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); } + static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); } }; template struct traits > @@ -293,8 +301,8 @@ template struct traits CholMatrixType; typedef TriangularView MatrixL; typedef TriangularView MatrixU; - static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); } - static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); } + static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); } + static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); } }; template struct traits > diff --git a/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 7b6183d0..72e1740c 100644 --- a/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -2,46 +2,21 @@ // for linear algebra. // // Copyright (C) 2008-2012 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
/* - -NOTE: thes functions vave been adapted from the LDL library: +NOTE: these functions have been adapted from the LDL library: LDL Copyright (c) 2005 by Timothy A. Davis. All Rights Reserved. -LDL License: - - Your use or distribution of LDL or any modified version of - LDL implies that you agree to this License. - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 - USA - - Permission is hereby granted to use or copy this program under the - terms of the GNU LGPL, provided that the Copyright, this License, - and the Availability of the original version is retained on all copies. - User documentation of any code that uses this code or any modified - version of this code must cite the Copyright, this License, the - Availability note, and "Used by permission." Permission to modify - the code and to distribute modified code is granted, provided the - Copyright, this License, and the Availability note are retained, - and a notice that the code was modified is included. +The author of LDL, Timothy A. Davis., has executed a license with Google LLC +to permit distribution of this code and derivative works as part of Eigen under +the Mozilla Public License v. 2.0, as stated at the top of this file. 
*/ -#include "../Core/util/NonMPL2.h" - #ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H #define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H @@ -122,7 +97,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& for(StorageIndex k = 0; k < size; ++k) { // compute nonzero pattern of kth row of L, in topological order - y[k] = 0.0; // Y(0:k) is now all zero + y[k] = Scalar(0); // Y(0:k) is now all zero StorageIndex top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L @@ -146,12 +121,12 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& /* compute numerical values kth row of L (a sparse triangular solve) */ RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset; // get D(k,k), apply the shift function, and clear Y(k) - y[k] = 0.0; + y[k] = Scalar(0); for(; top < size; ++top) { Index i = pattern[top]; /* pattern[top:n-1] is pattern of L(:,k) */ Scalar yi = y[i]; /* get and clear Y(i) */ - y[i] = 0.0; + y[i] = Scalar(0); /* the nonzero entry L(k,i) */ Scalar l_ki; diff --git a/inst/include/Eigen/src/SparseCore/CompressedStorage.h b/inst/include/Eigen/src/SparseCore/CompressedStorage.h index d89fa0da..acd986fa 100644 --- a/inst/include/Eigen/src/SparseCore/CompressedStorage.h +++ b/inst/include/Eigen/src/SparseCore/CompressedStorage.h @@ -207,6 +207,22 @@ class CompressedStorage return m_values[id]; } + void moveChunk(Index from, Index to, Index chunkSize) + { + eigen_internal_assert(to+chunkSize <= m_size); + if(to>from && from+chunkSize>to) + { + // move backward + internal::smart_memmove(m_values+from, m_values+from+chunkSize, m_values+to); + internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to); + } + else + { + internal::smart_copy(m_values+from, m_values+from+chunkSize, m_values+to); + internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to); + } + } + void prune(const Scalar& reference, const 
RealScalar& epsilon = NumTraits::dummy_precision()) { Index k = 0; diff --git a/inst/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/inst/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 9db119b6..94865025 100644 --- a/inst/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/inst/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H #define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -25,16 +25,16 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r Index rows = lhs.innerSize(); Index cols = rhs.outerSize(); eigen_assert(lhs.outerSize() == rhs.innerSize()); - + ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0); ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0); ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0); - + std::memset(mask,0,sizeof(bool)*rows); evaluator lhsEval(lhs); evaluator rhsEval(rhs); - + // estimate the number of non zero entries // given a rhs column containing Y non zeros, we assume that the respective Y columns // of the lhs differs in average of one non zeros, thus the number of non zeros for @@ -141,7 +141,7 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; typedef SparseMatrix ColMajorMatrixAux; typedef typename sparse_eval::type ColMajorMatrix; - + // If the result is tall and thin (in the extreme case a column vector) // then it is faster to sort the coefficients inplace instead of transposing twice. // FIXME, the following heuristic is probably not very good. 
@@ -155,7 +155,7 @@ struct conservative_sparse_sparse_product_selector(lhs, rhs, resCol, false); RowMajorMatrix resRow(resCol); res = resRow.markAsRValue(); diff --git a/inst/include/Eigen/src/SparseCore/SparseAssign.h b/inst/include/Eigen/src/SparseCore/SparseAssign.h index 18352a84..905485c8 100644 --- a/inst/include/Eigen/src/SparseCore/SparseAssign.h +++ b/inst/include/Eigen/src/SparseCore/SparseAssign.h @@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src) // eval without temporary dst.resize(src.rows(), src.cols()); dst.setZero(); - dst.reserve((std::max)(src.rows(),src.cols())*2); + dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2)); for (Index j=0; j }; // Generic Sparse to Dense assignment -template< typename DstXprType, typename SrcXprType, typename Functor> -struct Assignment +template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak> +struct Assignment { static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { @@ -153,6 +153,73 @@ struct Assignment } }; +// Specialization for dense ?= dense +/- sparse and dense ?= sparse +/- dense +template +struct assignment_from_dense_op_sparse +{ + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) + { + #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN + EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN + #endif + + call_assignment_no_alias(dst, src.lhs(), Func1()); + call_assignment_no_alias(dst, src.rhs(), Func2()); + } + + // Specialization for dense1 = sparse + dense2; -> dense1 = dense2; dense1 += sparse; + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::Shape,DenseShape>::value>::type + run(DstXprType &dst, const CwiseBinaryOp, const Lhs, const Rhs> &src, + const internal::assign_op& /*func*/) + { + #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN + 
EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN + #endif + + // Apply the dense matrix first, then the sparse one. + call_assignment_no_alias(dst, src.rhs(), Func1()); + call_assignment_no_alias(dst, src.lhs(), Func2()); + } + + // Specialization for dense1 = sparse - dense2; -> dense1 = -dense2; dense1 += sparse; + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::Shape,DenseShape>::value>::type + run(DstXprType &dst, const CwiseBinaryOp, const Lhs, const Rhs> &src, + const internal::assign_op& /*func*/) + { + #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN + EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN + #endif + + // Apply the dense matrix first, then the sparse one. + call_assignment_no_alias(dst, -src.rhs(), Func1()); + call_assignment_no_alias(dst, src.lhs(), add_assign_op()); + } +}; + +#define EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(ASSIGN_OP,BINOP,ASSIGN_OP2) \ + template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> \ + struct Assignment, const Lhs, const Rhs>, internal::ASSIGN_OP, \ + Sparse2Dense, \ + typename internal::enable_if< internal::is_same::Shape,DenseShape>::value \ + || internal::is_same::Shape,DenseShape>::value>::type> \ + : assignment_from_dense_op_sparse, internal::ASSIGN_OP2 > \ + {} + +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op, scalar_sum_op,add_assign_op); +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_sum_op,add_assign_op); +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_sum_op,sub_assign_op); + +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op, scalar_difference_op,sub_assign_op); +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_difference_op,sub_assign_op); +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_difference_op,add_assign_op); + + // Specialization for "dst = dec.solve(rhs)" // NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error template @@ -179,35 +246,22 @@ struct Assignment { 
typedef typename DstXprType::StorageIndex StorageIndex; typedef typename DstXprType::Scalar Scalar; - typedef Array ArrayXI; - typedef Array ArrayXS; - template - static void run(SparseMatrix &dst, const SrcXprType &src, const internal::assign_op &/*func*/) - { - Index dstRows = src.rows(); - Index dstCols = src.cols(); - if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) - dst.resize(dstRows, dstCols); - Index size = src.diagonal().size(); - dst.makeCompressed(); - dst.resizeNonZeros(size); - Map(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1); - Map(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size)); - Map(dst.valuePtr(), size) = src.diagonal(); - } + template + static void run(SparseMatrix &dst, const SrcXprType &src, const AssignFunc &func) + { dst.assignDiagonal(src.diagonal(), func); } template static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::assign_op &/*func*/) - { - dst.diagonal() = src.diagonal(); - } + { dst.derived().diagonal() = src.diagonal(); } - static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) - { dst.diagonal() += src.diagonal(); } + template + static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.derived().diagonal() += src.diagonal(); } - static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) - { dst.diagonal() -= src.diagonal(); } + template + static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.derived().diagonal() -= src.diagonal(); } }; } // end namespace internal diff --git a/inst/include/Eigen/src/SparseCore/SparseBlock.h b/inst/include/Eigen/src/SparseCore/SparseBlock.h index 511e92b2..5b4f6cc9 100644 --- a/inst/include/Eigen/src/SparseCore/SparseBlock.h +++ b/inst/include/Eigen/src/SparseCore/SparseBlock.h @@ -164,7 +164,7 @@ class sparse_matrix_block_impl } else { - 
if(m_matrix.isCompressed()) + if(m_matrix.isCompressed() && nnz!=block_size) { // no need to realloc, simply copy the tail at its respective position and insert tmp matrix.data().resize(start + nnz + tail_size); @@ -326,46 +326,6 @@ class BlockImpl,BlockRows,B //---------- -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). - */ -template -typename SparseMatrixBase::InnerVectorReturnType SparseMatrixBase::innerVector(Index outer) -{ return InnerVectorReturnType(derived(), outer); } - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). Read-only. - */ -template -const typename SparseMatrixBase::ConstInnerVectorReturnType SparseMatrixBase::innerVector(Index outer) const -{ return ConstInnerVectorReturnType(derived(), outer); } - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). - */ -template -typename SparseMatrixBase::InnerVectorsReturnType -SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). Read-only. - */ -template -const typename SparseMatrixBase::ConstInnerVectorsReturnType -SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) const -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - /** Generic implementation of sparse Block expression. * Real-only. 
*/ @@ -486,9 +446,13 @@ struct unary_evaluator, IteratorBa {} inline Index nonZerosEstimate() const { - Index nnz = m_block.nonZeros(); - if(nnz<0) - return m_argImpl.nonZerosEstimate() * m_block.size() / m_block.nestedExpression().size(); + const Index nnz = m_block.nonZeros(); + if(nnz < 0) { + // Scale the non-zero estimate for the underlying expression linearly with block size. + // Return zero if the underlying block is empty. + const Index nested_sz = m_block.nestedExpression().size(); + return nested_sz == 0 ? 0 : m_argImpl.nonZerosEstimate() * m_block.size() / nested_sz; + } return nnz; } @@ -503,22 +467,25 @@ template class unary_evaluator, IteratorBased>::InnerVectorInnerIterator : public EvalIterator { - enum { IsRowMajor = unary_evaluator::IsRowMajor }; + // NOTE MSVC fails to compile if we don't explicitely "import" IsRowMajor from unary_evaluator + // because the base class EvalIterator has a private IsRowMajor enum too. (bug #1786) + // NOTE We cannot call it IsRowMajor because it would shadow unary_evaluator::IsRowMajor + enum { XprIsRowMajor = unary_evaluator::IsRowMajor }; const XprType& m_block; Index m_end; public: EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer) - : EvalIterator(aEval.m_argImpl, outer + (IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())), + : EvalIterator(aEval.m_argImpl, outer + (XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())), m_block(aEval.m_block), - m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()) + m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()) { - while( (EvalIterator::operator bool()) && (EvalIterator::index() < (IsRowMajor ? m_block.startCol() : m_block.startRow())) ) + while( (EvalIterator::operator bool()) && (EvalIterator::index() < (XprIsRowMajor ? 
m_block.startCol() : m_block.startRow())) ) EvalIterator::operator++(); } - inline StorageIndex index() const { return EvalIterator::index() - convert_index(IsRowMajor ? m_block.startCol() : m_block.startRow()); } - inline Index outer() const { return EvalIterator::outer() - (IsRowMajor ? m_block.startRow() : m_block.startCol()); } + inline StorageIndex index() const { return EvalIterator::index() - convert_index(XprIsRowMajor ? m_block.startCol() : m_block.startRow()); } + inline Index outer() const { return EvalIterator::outer() - (XprIsRowMajor ? m_block.startRow() : m_block.startCol()); } inline Index row() const { return EvalIterator::row() - m_block.startRow(); } inline Index col() const { return EvalIterator::col() - m_block.startCol(); } @@ -528,7 +495,8 @@ class unary_evaluator, IteratorBas template class unary_evaluator, IteratorBased>::OuterVectorInnerIterator { - enum { IsRowMajor = unary_evaluator::IsRowMajor }; + // NOTE see above + enum { XprIsRowMajor = unary_evaluator::IsRowMajor }; const unary_evaluator& m_eval; Index m_outerPos; const Index m_innerIndex; @@ -538,9 +506,9 @@ class unary_evaluator, IteratorBas EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer) : m_eval(aEval), - m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ), - m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()), - m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()), + m_outerPos( (XprIsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ), + m_innerIndex(XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()), + m_end(XprIsRowMajor ? 
aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()), m_it(m_eval.m_argImpl, m_outerPos) { EIGEN_UNUSED_VARIABLE(outer); @@ -551,10 +519,10 @@ class unary_evaluator, IteratorBas ++(*this); } - inline StorageIndex index() const { return convert_index(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); } + inline StorageIndex index() const { return convert_index(m_outerPos - (XprIsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); } inline Index outer() const { return 0; } - inline Index row() const { return IsRowMajor ? 0 : index(); } - inline Index col() const { return IsRowMajor ? index() : 0; } + inline Index row() const { return XprIsRowMajor ? 0 : index(); } + inline Index col() const { return XprIsRowMajor ? index() : 0; } inline Scalar value() const { return m_it.value(); } inline Scalar& valueRef() { return m_it.valueRef(); } diff --git a/inst/include/Eigen/src/SparseCore/SparseCompressedBase.h b/inst/include/Eigen/src/SparseCore/SparseCompressedBase.h index 5ccb4665..6a2c7a8c 100644 --- a/inst/include/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/inst/include/Eigen/src/SparseCore/SparseCompressedBase.h @@ -128,6 +128,28 @@ class SparseCompressedBase protected: /** Default constructor. Do nothing. */ SparseCompressedBase() {} + + /** \internal return the index of the coeff at (row,col) or just before if it does not exist. + * This is an analogue of std::lower_bound. + */ + internal::LowerBoundIndex lower_bound(Index row, Index col) const + { + eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? col : row; + + Index start = this->outerIndexPtr()[outer]; + Index end = this->isCompressed() ? 
this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer]; + eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + internal::LowerBoundIndex p; + p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr(); + p.found = (p.valueinnerIndexPtr()[p.value]==inner); + return p; + } + + friend struct internal::evaluator >; + private: template explicit SparseCompressedBase(const SparseCompressedBase&); }; @@ -185,6 +207,14 @@ class SparseCompressedBase::InnerIterator } inline InnerIterator& operator++() { m_id++; return *this; } + inline InnerIterator& operator+=(Index i) { m_id += i ; return *this; } + + inline InnerIterator operator+(Index i) + { + InnerIterator result = *this; + result += i; + return result; + } inline const Scalar& value() const { return m_values[m_id]; } inline Scalar& valueRef() { return const_cast(m_values[m_id]); } @@ -245,6 +275,14 @@ class SparseCompressedBase::ReverseInnerIterator } inline ReverseInnerIterator& operator--() { --m_id; return *this; } + inline ReverseInnerIterator& operator-=(Index i) { m_id -= i; return *this; } + + inline ReverseInnerIterator operator-(Index i) + { + ReverseInnerIterator result = *this; + result -= i; + return result; + } inline const Scalar& value() const { return m_values[m_id-1]; } inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } @@ -317,17 +355,8 @@ struct evaluator > Index find(Index row, Index col) const { - eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); - - const Index outer = Derived::IsRowMajor ? row : col; - const Index inner = Derived::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? 
m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr(); - - return ((pinnerIndexPtr()[p]==inner)) ? p : Dynamic; + internal::LowerBoundIndex p = m_matrix->lower_bound(row,col); + return p.found ? p.value : Dynamic; } const Derived *m_matrix; diff --git a/inst/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/inst/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index e315e355..9b0d3f98 100644 --- a/inst/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/inst/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -101,7 +101,7 @@ struct binary_evaluator, IteratorBased, Iterat } else { - m_value = 0; // this is to avoid a compilation warning + m_value = Scalar(0); // this is to avoid a compilation warning m_id = -1; } return *this; @@ -126,7 +126,7 @@ struct binary_evaluator, IteratorBased, Iterat enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -211,9 +211,8 @@ struct binary_evaluator, IndexBased, IteratorB enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - // Expose storage order of the sparse expression - Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit) + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = XprType::Flags }; explicit binary_evaluator(const XprType& xpr) @@ -299,9 +298,8 @@ struct binary_evaluator, IteratorBased, IndexB enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - 
// Expose storage order of the sparse expression - Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit) + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = XprType::Flags }; explicit binary_evaluator(const XprType& xpr) @@ -459,7 +457,7 @@ struct sparse_conjunction_evaluator enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -532,9 +530,8 @@ struct sparse_conjunction_evaluator enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - // Expose storage order of the sparse expression - Flags = (XprType::Flags & ~RowMajorBit) | (int(RhsArg::Flags)&RowMajorBit) + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = XprType::Flags }; explicit sparse_conjunction_evaluator(const XprType& xpr) @@ -607,9 +604,8 @@ struct sparse_conjunction_evaluator enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - // Expose storage order of the sparse expression - Flags = (XprType::Flags & ~RowMajorBit) | (int(LhsArg::Flags)&RowMajorBit) + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = XprType::Flags }; explicit sparse_conjunction_evaluator(const XprType& xpr) diff --git a/inst/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/inst/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index df6c28d2..32dac0f7 100644 --- a/inst/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/inst/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -24,7 +24,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + 
CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -79,7 +79,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/inst/include/Eigen/src/SparseCore/SparseDenseProduct.h b/inst/include/Eigen/src/SparseCore/SparseDenseProduct.h index 0547db59..f005a18a 100644 --- a/inst/include/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/inst/include/Eigen/src/SparseCore/SparseDenseProduct.h @@ -88,10 +88,11 @@ struct sparse_time_dense_product_impl::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef evaluator LhsEval; + typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) { - evaluator lhsEval(lhs); + LhsEval lhsEval(lhs); for(Index c=0; c::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef evaluator LhsEval; + typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) { - evaluator lhsEval(lhs); - for(Index j=0; j1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000) { - typename Res::RowXpr res_j(res.row(j)); - for(LhsInnerIterator it(lhsEval,j); it ;++it) - res_j += (alpha*it.value()) * rhs.row(it.index()); + #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) + for(Index i=0; i diff --git a/inst/include/Eigen/src/SparseCore/SparseMatrix.h b/inst/include/Eigen/src/SparseCore/SparseMatrix.h index 
a5396538..616b4a0c 100644 --- a/inst/include/Eigen/src/SparseCore/SparseMatrix.h +++ b/inst/include/Eigen/src/SparseCore/SparseMatrix.h @@ -21,7 +21,7 @@ namespace Eigen { * This class implements a more versatile variants of the common \em compressed row/column storage format. * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index. * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra - * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero + * space in between the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero * can be done with limited memory reallocation and copies. * * A call to the function makeCompressed() turns the matrix into the standard \em compressed format @@ -99,6 +99,8 @@ class SparseMatrix typedef SparseCompressedBase Base; using Base::convert_index; friend class SparseVector<_Scalar,0,_StorageIndex>; + template + friend struct internal::Assignment; public: using Base::isCompressed; using Base::nonZeros; @@ -503,8 +505,8 @@ class SparseMatrix m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; } } - - /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */ + + /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */ void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { prune(default_prunning_func(reference,epsilon)); @@ -577,10 +579,12 @@ class SparseMatrix else if (innerChange < 0) { // Inner size decreased: allocate a new m_innerNonZeros - m_innerNonZeros = static_cast(std::malloc((m_outerSize+outerChange+1) * sizeof(StorageIndex))); + m_innerNonZeros = static_cast(std::malloc((m_outerSize + outerChange) * sizeof(StorageIndex))); if (!m_innerNonZeros) internal::throw_std_bad_alloc(); - for(Index 
i = 0; i < m_outerSize; i++) + for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++) m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; + for(Index i = m_outerSize; i < m_outerSize + outerChange; i++) + m_innerNonZeros[i] = 0; } // Change the m_innerNonZeros in case of a decrease of inner size @@ -605,9 +609,9 @@ class SparseMatrix m_outerIndex = newOuterIndex; if (outerChange > 0) { - StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize]; + StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize]; for(Index i=m_outerSize; i inline SparseMatrix& operator=(const EigenBase& other) { return Base::operator=(other.derived()); } + + template + inline SparseMatrix& operator=(const Product& other); #endif // EIGEN_PARSED_BY_DOXYGEN template @@ -896,6 +903,113 @@ class SparseMatrix m_data.index(p) = convert_index(inner); return (m_data.value(p) = Scalar(0)); } +protected: + struct IndexPosPair { + IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {} + Index i; + Index p; + }; + + /** \internal assign \a diagXpr to the diagonal of \c *this + * There are different strategies: + * 1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression. + * 2 - otherwise, for each diagonal coeff, + * 2.a - if it already exists, then we update it, + * 2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion. + * 2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector. + * 3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements. + * + * TODO: some piece of code could be isolated and reused for a general in-place update strategy. 
+ * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once), + * then it *might* be better to disable case 2.b since they will have to be copied anyway. + */ + template + void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc) + { + Index n = diagXpr.size(); + + const bool overwrite = internal::is_same >::value; + if(overwrite) + { + if((this->rows()!=n) || (this->cols()!=n)) + this->resize(n, n); + } + + if(m_data.size()==0 || overwrite) + { + typedef Array ArrayXI; + this->makeCompressed(); + this->resizeNonZeros(n); + Eigen::Map(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1); + Eigen::Map(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n)); + Eigen::Map > values = this->coeffs(); + values.setZero(); + internal::call_assignment_no_alias(values, diagXpr, assignFunc); + } + else + { + bool isComp = isCompressed(); + internal::evaluator diaEval(diagXpr); + std::vector newEntries; + + // 1 - try in-place update and record insertion failures + for(Index i = 0; ilower_bound(i,i); + Index p = lb.value; + if(lb.found) + { + // the coeff already exists + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i])) + { + // non compressed mode with local room for inserting one element + m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p); + m_innerNonZeros[i]++; + m_data.value(p) = Scalar(0); + m_data.index(p) = StorageIndex(i); + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else + { + // defer insertion + newEntries.push_back(IndexPosPair(i,p)); + } + } + // 2 - insert deferred entries + Index n_entries = Index(newEntries.size()); + if(n_entries>0) + { + Storage newData(m_data.size()+n_entries); + Index prev_p = 0; + Index prev_i = 0; + for(Index k=0; k T; std::vector tripletList; - triplets.reserve(estimation_of_entries); + tripletList.reserve(estimation_of_entries); for(...) { // ... 
@@ -987,7 +1101,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa * * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather - * be explicitely stored into a std::vector for instance. + * be explicitly stored into a std::vector for instance. */ template template @@ -1233,7 +1347,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca } m_data.index(p) = convert_index(inner); - return (m_data.value(p) = 0); + return (m_data.value(p) = Scalar(0)); } if(m_data.size() != m_data.allocatedSize()) diff --git a/inst/include/Eigen/src/SparseCore/SparseMatrixBase.h b/inst/include/Eigen/src/SparseCore/SparseMatrixBase.h index c6b548f1..229449f0 100644 --- a/inst/include/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/inst/include/Eigen/src/SparseCore/SparseMatrixBase.h @@ -87,6 +87,11 @@ template class SparseMatrixBase * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2, + /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, + * and 2 for matrices. + */ + Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". 
@@ -350,18 +355,6 @@ template class SparseMatrixBase const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); } const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); } - // inner-vector - typedef Block InnerVectorReturnType; - typedef Block ConstInnerVectorReturnType; - InnerVectorReturnType innerVector(Index outer); - const ConstInnerVectorReturnType innerVector(Index outer) const; - - // set of inner-vectors - typedef Block InnerVectorsReturnType; - typedef Block ConstInnerVectorsReturnType; - InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize); - const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const; - DenseMatrixType toDense() const { return DenseMatrixType(derived()); diff --git a/inst/include/Eigen/src/SparseCore/SparseProduct.h b/inst/include/Eigen/src/SparseCore/SparseProduct.h index 4cbf6878..af8a7744 100644 --- a/inst/include/Eigen/src/SparseCore/SparseProduct.h +++ b/inst/include/Eigen/src/SparseCore/SparseProduct.h @@ -17,7 +17,7 @@ namespace Eigen { * The automatic pruning of the small values can be achieved by calling the pruned() function * in which case a totally different product algorithm is employed: * \code - * C = (A*B).pruned(); // supress numerical zeros (exact) + * C = (A*B).pruned(); // suppress numerical zeros (exact) * C = (A*B).pruned(ref); * C = (A*B).pruned(ref,epsilon); * \endcode @@ -164,6 +164,18 @@ struct unary_evaluator >, IteratorBased> } // end namespace internal +// sparse matrix = sparse-product (can be sparse*sparse, sparse*perm, etc.) 
+template +template +SparseMatrix& SparseMatrix::operator=(const Product& src) +{ + // std::cout << "in Assignment : " << DstOptions << "\n"; + SparseMatrix dst(src.rows(),src.cols()); + internal::generic_product_impl::evalTo(dst,src.lhs(),src.rhs()); + this->swap(dst); + return *this; +} + } // end namespace Eigen #endif // EIGEN_SPARSEPRODUCT_H diff --git a/inst/include/Eigen/src/SparseCore/SparseRef.h b/inst/include/Eigen/src/SparseCore/SparseRef.h index d91f38f9..748f87d6 100644 --- a/inst/include/Eigen/src/SparseCore/SparseRef.h +++ b/inst/include/Eigen/src/SparseCore/SparseRef.h @@ -201,7 +201,7 @@ class Ref, Options, StrideType ~Ref() { if(m_hasCopy) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); obj->~TPlainObjectType(); } } @@ -213,7 +213,7 @@ class Ref, Options, StrideType { if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed())) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); @@ -227,14 +227,14 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); } protected: - char m_object_bytes[sizeof(TPlainObjectType)]; + typename internal::aligned_storage::type m_storage; bool m_hasCopy; }; @@ -319,7 +319,7 @@ class Ref, Options, StrideType ~Ref() { if(m_hasCopy) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); obj->~TPlainObjectType(); } } @@ -335,14 +335,14 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + 
TPlainObjectType* obj = reinterpret_cast(&m_storage); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); } protected: - char m_object_bytes[sizeof(TPlainObjectType)]; + typename internal::aligned_storage::type m_storage; bool m_hasCopy; }; diff --git a/inst/include/Eigen/src/SparseCore/SparseSelfAdjointView.h b/inst/include/Eigen/src/SparseCore/SparseSelfAdjointView.h index 76117a01..85b00e10 100644 --- a/inst/include/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/inst/include/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -142,6 +142,9 @@ template class SparseSelfAdjointView return *this = src.twistedBy(pnull); } + // Since we override the copy-assignment operator, we need to explicitly re-declare the copy-constructor + EIGEN_DEFAULT_COPY_CONSTRUCTOR(SparseSelfAdjointView) + template SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src) { diff --git a/inst/include/Eigen/src/SparseCore/SparseUtil.h b/inst/include/Eigen/src/SparseCore/SparseUtil.h index 74df0d49..ceb93688 100644 --- a/inst/include/Eigen/src/SparseCore/SparseUtil.h +++ b/inst/include/Eigen/src/SparseCore/SparseUtil.h @@ -140,6 +140,14 @@ struct SparseSelfAdjointShape { static std::string debugName() { return "SparseS template<> struct glue_shapes { typedef SparseSelfAdjointShape type; }; template<> struct glue_shapes { typedef SparseTriangularShape type; }; +// return type of SparseCompressedBase::lower_bound; +struct LowerBoundIndex { + LowerBoundIndex() : value(-1), found(false) {} + LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {} + Index value; + bool found; +}; + } // end namespace internal /** \ingroup SparseCore_Module diff --git a/inst/include/Eigen/src/SparseCore/SparseVector.h b/inst/include/Eigen/src/SparseCore/SparseVector.h index 19b0fbc9..05779be6 100644 --- a/inst/include/Eigen/src/SparseCore/SparseVector.h +++ b/inst/include/Eigen/src/SparseCore/SparseVector.h @@ -281,7 +281,7 @@ class SparseVector } /** Swaps the values of \c 
*this and \a other. - * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only. + * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only. * \sa SparseMatrixBase::swap() */ inline void swap(SparseVector& other) diff --git a/inst/include/Eigen/src/SparseLU/SparseLU.h b/inst/include/Eigen/src/SparseLU/SparseLU.h index 87f0efe3..0c8d8939 100644 --- a/inst/include/Eigen/src/SparseLU/SparseLU.h +++ b/inst/include/Eigen/src/SparseLU/SparseLU.h @@ -18,6 +18,63 @@ template struct SparseLUMatrixLReturnType; template struct SparseLUMatrixUReturnType; +template +class SparseLUTransposeView : public SparseSolverBase > +{ +protected: + typedef SparseSolverBase > APIBase; + using APIBase::m_isInitialized; +public: + typedef typename SparseLUType::Scalar Scalar; + typedef typename SparseLUType::StorageIndex StorageIndex; + typedef typename SparseLUType::MatrixType MatrixType; + typedef typename SparseLUType::OrderingType OrderingType; + + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + + SparseLUTransposeView() : m_sparseLU(NULL) {} + SparseLUTransposeView(const SparseLUTransposeView& view) { + this->m_sparseLU = view.m_sparseLU; + } + void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;} + void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;} + using APIBase::_solve_impl; + template + bool _solve_impl(const MatrixBase &B, MatrixBase &X_base) const + { + Dest& X(X_base.derived()); + eigen_assert(m_sparseLU->info() == Success && "The matrix should be factorized first"); + EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0, + THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + + + // this ugly const_cast_derived() helps to detect aliasing when applying the permutations + for(Index j = 0; j < B.cols(); ++j){ + X.col(j) = m_sparseLU->colsPermutation() * 
B.const_cast_derived().col(j); + } + //Forward substitution with transposed or adjoint of U + m_sparseLU->matrixU().template solveTransposedInPlace(X); + + //Backward substitution with transposed or adjoint of L + m_sparseLU->matrixL().template solveTransposedInPlace(X); + + // Permute back the solution + for (Index j = 0; j < B.cols(); ++j) + X.col(j) = m_sparseLU->rowsPermutation().transpose() * X.col(j); + return true; + } + inline Index rows() const { return m_sparseLU->rows(); } + inline Index cols() const { return m_sparseLU->cols(); } + +private: + SparseLUType *m_sparseLU; + SparseLUTransposeView& operator=(const SparseLUTransposeView&); +}; + + /** \ingroup SparseLU_Module * \class SparseLU * @@ -26,7 +83,7 @@ template struct SparseLUMatrixURetu * This class implements the supernodal LU factorization for general matrices. * It uses the main techniques from the sequential SuperLU package * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real - * and complex arithmetics with single and double precision, depending on the + * and complex arithmetic with single and double precision, depending on the * scalar type of your input matrix. * The code has been optimized to provide BLAS-3 operations during supernode-panel updates. * It benefits directly from the built-in high-performant Eigen BLAS routines. @@ -97,6 +154,7 @@ class SparseLU : public SparseSolverBase >, }; public: + SparseLU():m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1) { initperfvalues(); @@ -128,6 +186,45 @@ class SparseLU : public SparseSolverBase >, //Factorize factorize(matrix); } + + /** \returns an expression of the transposed of the factored matrix. 
+ * + * A typical usage is to solve for the transposed problem A^T x = b: + * \code + * solver.compute(A); + * x = solver.transpose().solve(b); + * \endcode + * + * \sa adjoint(), solve() + */ + const SparseLUTransposeView > transpose() + { + SparseLUTransposeView > transposeView; + transposeView.setSparseLU(this); + transposeView.setIsInitialized(this->m_isInitialized); + return transposeView; + } + + + /** \returns an expression of the adjoint of the factored matrix + * + * A typical usage is to solve for the adjoint problem A' x = b: + * \code + * solver.compute(A); + * x = solver.adjoint().solve(b); + * \endcode + * + * For real scalar types, this function is equivalent to transpose(). + * + * \sa transpose(), solve() + */ + const SparseLUTransposeView > adjoint() + { + SparseLUTransposeView > adjointView; + adjointView.setSparseLU(this); + adjointView.setIsInitialized(this->m_isInitialized); + return adjointView; + } inline Index rows() const { return m_mat.rows(); } inline Index cols() const { return m_mat.cols(); } @@ -193,7 +290,7 @@ class SparseLU : public SparseSolverBase >, /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance * \c InvalidInput if the input matrix is invalid * @@ -355,6 +452,9 @@ class SparseLU : public SparseSolverBase >, return (m_detPermR * m_detPermC) > 0 ? 
det : -det; } + Index nnzL() const { return m_nnzL; }; + Index nnzU() const { return m_nnzU; }; + protected: // Functions void initperfvalues() @@ -391,7 +491,6 @@ class SparseLU : public SparseSolverBase >, private: // Disable copy constructor SparseLU (const SparseLU& ); - }; // End class SparseLU @@ -501,7 +600,6 @@ void SparseLU::factorize(const MatrixType& matrix) m_isInitialized = true; - // Apply the column permutation computed in analyzepattern() // m_mat = matrix * m_perm_c.inverse(); m_mat = matrix; @@ -585,7 +683,6 @@ void SparseLU::factorize(const MatrixType& matrix) // (a) a relaxed supernode at the bottom of the etree, or // (b) panel_size contiguous columns, defined by the user Index jcol; - IndexVector panel_histo(n); Index pivrow; // Pivotal row number in the original row matrix Index nseg1; // Number of segments in U-column above panel row jcol Index nseg; // Number of segments in each U-column @@ -704,13 +801,19 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator typedef typename MappedSupernodalType::Scalar Scalar; explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL) { } - Index rows() { return m_mapL.rows(); } - Index cols() { return m_mapL.cols(); } + Index rows() const { return m_mapL.rows(); } + Index cols() const { return m_mapL.cols(); } template void solveInPlace( MatrixBase &X) const { m_mapL.solveInPlace(X); } + template + void solveTransposedInPlace( MatrixBase &X) const + { + m_mapL.template solveTransposedInPlace(X); + } + const MappedSupernodalType& m_mapL; }; @@ -721,8 +824,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) : m_mapL(mapL),m_mapU(mapU) { } - Index rows() { return m_mapL.rows(); } - Index cols() { return m_mapL.cols(); } + Index rows() const { return m_mapL.rows(); } + Index cols() const { return m_mapL.cols(); } template void solveInPlace(MatrixBase &X) const { @@ -745,8 
+848,9 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator } else { + // FIXME: the following lines should use Block expressions and not Map! Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); U = A.template triangularView().solve(U); } @@ -764,6 +868,52 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator } } // End For U-solve } + + template void solveTransposedInPlace(MatrixBase &X) const + { + using numext::conj; + Index nrhs = X.cols(); + Index n = X.rows(); + // Forward solve with U + for (Index k = 0; k <= m_mapL.nsuper(); k++) + { + Index fsupc = m_mapL.supToCol()[k]; + Index lda = m_mapL.colIndexPtr()[fsupc+1] - m_mapL.colIndexPtr()[fsupc]; // leading dimension + Index nsupc = m_mapL.supToCol()[k+1] - fsupc; + Index luptr = m_mapL.colIndexPtr()[fsupc]; + + for (Index j = 0; j < nrhs; ++j) + { + for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++) + { + typename MatrixUType::InnerIterator it(m_mapU, jcol); + for ( ; it; ++it) + { + Index irow = it.index(); + X(jcol, j) -= X(irow, j) * (Conjugate? conj(it.value()): it.value()); + } + } + } + if (nsupc == 1) + { + for (Index j = 0; j < nrhs; j++) + { + X(fsupc, j) /= (Conjugate? 
conj(m_mapL.valuePtr()[luptr]) : m_mapL.valuePtr()[luptr]); + } + } + else + { + Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); + Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + if(Conjugate) + U = A.adjoint().template triangularView().solve(U); + else + U = A.transpose().template triangularView().solve(U); + } + }// End For U-solve + } + + const MatrixLType& m_mapL; const MatrixUType& m_mapU; }; diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_Memory.h b/inst/include/Eigen/src/SparseLU/SparseLU_Memory.h index 4dc42e87..349bfd58 100644 --- a/inst/include/Eigen/src/SparseLU/SparseLU_Memory.h +++ b/inst/include/Eigen/src/SparseLU/SparseLU_Memory.h @@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w) /** - * Expand the existing storage to accomodate more fill-ins + * Expand the existing storage to accommodate more fill-ins * \param vec Valid pointer to the vector to allocate or expand * \param[in,out] length At input, contain the current length of the vector that is to be increased. 
At output, length of the newly allocated vector * \param[in] nbElts Current number of elements in the factors diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/inst/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 721e1883..0be293d1 100644 --- a/inst/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/inst/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -75,12 +75,12 @@ class MappedSuperNodalMatrix /** * Number of rows */ - Index rows() { return m_row; } + Index rows() const { return m_row; } /** * Number of columns */ - Index cols() { return m_col; } + Index cols() const { return m_col; } /** * Return the array of nonzero values packed by column @@ -156,6 +156,9 @@ class MappedSuperNodalMatrix class InnerIterator; template void solveInPlace( MatrixBase&X) const; + template + void solveTransposedInPlace( MatrixBase&X) const; + @@ -294,6 +297,77 @@ void MappedSuperNodalMatrix::solveInPlace( MatrixBase&X) co } } +template +template +void MappedSuperNodalMatrix::solveTransposedInPlace( MatrixBase&X) const +{ + using numext::conj; + Index n = int(X.rows()); + Index nrhs = Index(X.cols()); + const Scalar * Lval = valuePtr(); // Nonzero values + Matrix work(n, nrhs); // working vector + work.setZero(); + for (Index k = nsuper(); k >= 0; k--) + { + Index fsupc = supToCol()[k]; // First column of the current supernode + Index istart = rowIndexPtr()[fsupc]; // Pointer index to the subscript of the current column + Index nsupr = rowIndexPtr()[fsupc+1] - istart; // Number of rows in the current supernode + Index nsupc = supToCol()[k+1] - fsupc; // Number of columns in the current supernode + Index nrow = nsupr - nsupc; // Number of rows in the non-diagonal part of the supernode + Index irow; //Current index row + + if (nsupc == 1 ) + { + for (Index j = 0; j < nrhs; j++) + { + InnerIterator it(*this, fsupc); + ++it; // Skip the diagonal element + for (; it; ++it) + { + irow = it.row(); + X(fsupc,j) -= X(irow, j) * 
(Conjugate?conj(it.value()):it.value()); + } + } + } + else + { + // The supernode has more than one column + Index luptr = colIndexPtr()[fsupc]; + Index lda = colIndexPtr()[fsupc+1] - luptr; + + //Begin Gather + for (Index j = 0; j < nrhs; j++) + { + Index iptr = istart + nsupc; + for (Index i = 0; i < nrow; i++) + { + irow = rowIndex()[iptr]; + work.topRows(nrow)(i,j)= X(irow,j); // Gather operation + iptr++; + } + } + + // Matrix-vector product with transposed submatrix + Map, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); + Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + if(Conjugate) + U = U - A.adjoint() * work.topRows(nrow); + else + U = U - A.transpose() * work.topRows(nrow); + + // Triangular solve (of transposed diagonal block) + new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); + if(Conjugate) + U = A.adjoint().template triangularView().solve(U); + else + U = A.transpose().template triangularView().solve(U); + + } + + } +} + + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_column_dfs.h b/inst/include/Eigen/src/SparseLU/SparseLU_column_dfs.h index c98b30e3..5a2c941b 100644 --- a/inst/include/Eigen/src/SparseLU/SparseLU_column_dfs.h +++ b/inst/include/Eigen/src/SparseLU/SparseLU_column_dfs.h @@ -151,7 +151,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j StorageIndex ito = glu.xlsub(fsupc+1); glu.xlsub(jcolm1) = ito; StorageIndex istop = ito + jptr - jm1ptr; - xprune(jcolm1) = istop; // intialize xprune(jcol-1) + xprune(jcolm1) = istop; // initialize xprune(jcol-1) glu.xlsub(jcol) = istop; for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) @@ -166,7 +166,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j // Tidy up the pointers before exit glu.xsup(nsuper+1) = jcolp1; glu.supno(jcolp1) = nsuper; - xprune(jcol) = StorageIndex(nextl); // Intialize 
upper bound for pruning + xprune(jcol) = StorageIndex(nextl); // Initialize upper bound for pruning glu.xlsub(jcolp1) = StorageIndex(nextl); return 0; diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/inst/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h index 95ba7413..e37c2fe0 100644 --- a/inst/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ b/inst/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ pstore(C0+i+(I)*PacketSize, c0); - // agressive vectorization and peeling + // aggressive vectorization and peeling for(Index i=0; i + * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011. + * + * Even though it is qualified as "rank-revealing", this strategy might fail for some + * rank deficient problems. When this class is used to solve linear or least-square problems + * it is thus strongly recommended to check the accuracy of the computed solution. If it + * failed, it usually helps to increase the threshold with setPivotThreshold. + * * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix. * @@ -331,7 +343,7 @@ void SparseQR::analyzePattern(const MatrixType& mat) m_R.resize(m, n); m_Q.resize(m, diagSize); - // Allocate space for nonzero elements : rough estimation + // Allocate space for nonzero elements: rough estimation m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree m_Q.reserve(2*mat.nonZeros()); m_hcoeffs.resize(diagSize); @@ -640,7 +652,8 @@ struct SparseQR_QProduct : ReturnByValue=0; k--) + Index start_k = internal::is_identity::value ? 
numext::mini(j,diagSize-1) : diagSize-1; + for (Index k = start_k; k >=0; k--) { Scalar tau = Scalar(0); tau = m_qr.m_Q.col(k).dot(res.col(j)); diff --git a/inst/include/Eigen/src/StlSupport/StdDeque.h b/inst/include/Eigen/src/StlSupport/StdDeque.h index af158f42..6d47e757 100644 --- a/inst/include/Eigen/src/StlSupport/StdDeque.h +++ b/inst/include/Eigen/src/StlSupport/StdDeque.h @@ -36,7 +36,7 @@ namespace std \ deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start, iterator end) : deque_base(start, end) {} \ + deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start, iterator end) : deque_base(start, end) {} \ + deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -98,19 +98,7 @@ namespace std { { return deque_base::insert(position,x); } void insert(const_iterator position, size_type new_size, const value_type& x) { deque_base::insert(position, new_size, x); } -#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1) - // workaround GCC std::deque implementation - // GCC 10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to - // fall-back to the default case - void resize(size_type new_size, const value_type& x) - { - if (new_size < deque_base::size()) - deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size); - else - deque_base::insert(deque_base::end(), 
new_size - deque_base::size(), x); - } #else - // either non-GCC or GCC between 4.1 and 10.1 // default implementation which should always work. void resize(size_type new_size, const value_type& x) { diff --git a/inst/include/Eigen/src/StlSupport/StdList.h b/inst/include/Eigen/src/StlSupport/StdList.h index e1eba498..8ba3fada 100644 --- a/inst/include/Eigen/src/StlSupport/StdList.h +++ b/inst/include/Eigen/src/StlSupport/StdList.h @@ -35,7 +35,7 @@ namespace std \ list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start, iterator end) : list_base(start, end) {} \ + list(iterator start_, iterator end_) : list_base(start_, end_) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start, iterator end) : list_base(start, end) {} \ + list(iterator start_, iterator end_) : list_base(start_, end_) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ diff --git a/inst/include/Eigen/src/StlSupport/StdVector.h b/inst/include/Eigen/src/StlSupport/StdVector.h index ec22821d..9fcf19bc 100644 --- a/inst/include/Eigen/src/StlSupport/StdVector.h +++ b/inst/include/Eigen/src/StlSupport/StdVector.h @@ -36,7 +36,7 @@ namespace std \ vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start, iterator end) : vector_base(start, end) {} \ + vector(iterator start_, 
iterator end_) : vector_base(start_, end_) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start, iterator end) : vector_base(start, end) {} \ + vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ diff --git a/inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h b/inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h index 7261c7d0..d1d3ad7f 100644 --- a/inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -217,12 +217,12 @@ struct SluMatrix : SuperMatrix res.setScalarType(); // FIXME the following is not very accurate - if (MatrixType::Flags & Upper) + if (int(MatrixType::Flags) & int(Upper)) res.Mtype = SLU_TRU; - if (MatrixType::Flags & Lower) + if (int(MatrixType::Flags) & int(Lower)) res.Mtype = SLU_TRL; - eigen_assert(((MatrixType::Flags & SelfAdjoint)==0) && "SelfAdjoint matrix shape not supported by SuperLU"); + eigen_assert(((int(MatrixType::Flags) & int(SelfAdjoint))==0) && "SelfAdjoint matrix shape not supported by SuperLU"); return res; } @@ -352,7 +352,7 @@ class SuperLUBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. 
*/ ComputationInfo info() const @@ -650,9 +650,8 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase { eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()"); - const Index size = m_matrix.rows(); const Index rhsCols = b.cols(); - eigen_assert(size==b.rows()); + eigen_assert(m_matrix.rows()==b.rows()); m_sluOptions.Trans = NOTRANS; m_sluOptions.Fact = FACTORED; @@ -974,9 +973,8 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase wrapper functions: -inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) + // Defaults +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int) { umfpack_di_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_defaults(control); } -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long) +{ umfpack_dl_defaults(control); } + +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) +{ umfpack_zl_defaults(control); } + +// Report info +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int) { umfpack_di_report_info(control, info);} -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, int) { umfpack_zi_report_info(control, info);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long) +{ umfpack_dl_report_info(control, 
info);} + +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, SuiteSparse_long) +{ umfpack_zl_report_info(control, info);} + +// Report status +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int) { umfpack_di_report_status(control, status);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, int) { umfpack_zi_report_status(control, status);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], double) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long) +{ umfpack_dl_report_status(control, status);} + +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, SuiteSparse_long) +{ umfpack_zl_report_status(control, status);} + +// report control +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int) { umfpack_di_report_control(control);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_report_control(control);} -inline void umfpack_free_numeric(void **Numeric, double) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long) +{ umfpack_dl_report_control(control);} + +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) +{ umfpack_zl_report_control(control);} + +// Free numeric +inline void umfpack_free_numeric(void **Numeric, double, int) { umfpack_di_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_numeric(void **Numeric, std::complex) +inline void umfpack_free_numeric(void **Numeric, std::complex, int) { umfpack_zi_free_numeric(Numeric); *Numeric = 0; } -inline void 
umfpack_free_symbolic(void **Symbolic, double) +inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long) +{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; } + +inline void umfpack_free_numeric(void **Numeric, std::complex, SuiteSparse_long) +{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; } + +// Free symbolic +inline void umfpack_free_symbolic(void **Symbolic, double, int) { umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; } -inline void umfpack_free_symbolic(void **Symbolic, std::complex) +inline void umfpack_free_symbolic(void **Symbolic, std::complex, int) { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; } +inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long) +{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; } + +inline void umfpack_free_symbolic(void **Symbolic, std::complex, SuiteSparse_long) +{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; } + +// Symbolic inline int umfpack_symbolic(int n_row,int n_col, const int Ap[], const int Ai[], const double Ax[], void **Symbolic, const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) @@ -66,7 +119,21 @@ inline int umfpack_symbolic(int n_row,int n_col, { return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); } +inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, + const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +{ + return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info); +} +inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, + const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +{ + return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); +} + +// 
Numeric inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[], void *Symbolic, void **Numeric, const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) @@ -80,7 +147,21 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex Ax[], + void *Symbolic, void **Numeric, + const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) +{ + return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info); +} + +// solve inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[], double X[], const double B[], void *Numeric, const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) @@ -95,6 +176,21 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); } +inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], + double X[], const double B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +{ + return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info); +} + +inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], + std::complex X[], const std::complex B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +{ + return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); +} + +// Get Lunz inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double) { return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); @@ -105,6 +201,19 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_ return 
umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); } +inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, + SuiteSparse_long *nz_udiag, void *Numeric, double) +{ + return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); +} + +inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, + SuiteSparse_long *nz_udiag, void *Numeric, std::complex) +{ + return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); +} + +// Get Numeric inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[], int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric) { @@ -120,18 +229,45 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex Lx[], in return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, Dx?&dx0_real:0,0,do_recip,Rs,Numeric); } +inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[], + SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) +{ + return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric); +} -inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) +inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex Ux[], + SuiteSparse_long P[], SuiteSparse_long Q[], std::complex Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) +{ + double& lx0_real = numext::real_ref(Lx[0]); + double& ux0_real = numext::real_ref(Ux[0]); + double& dx0_real = numext::real_ref(Dx[0]); + return 
umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, + Dx?&dx0_real:0,0,do_recip,Rs,Numeric); +} + +// Get Determinant +inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) { return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info); } -inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) +inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) { double& mx_real = numext::real_ref(*Mx); return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); } +inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) +{ + return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info); +} + +inline SuiteSparse_long umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) +{ + double& mx_real = numext::real_ref(*Mx); + return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); +} + /** \ingroup UmfPackSupport_Module * \brief A sparse LU factorization and solver based on UmfPack @@ -164,7 +300,7 @@ class UmfPackLU : public SparseSolverBase > typedef Matrix IntRowVectorType; typedef Matrix IntColVectorType; typedef SparseMatrix LUMatrixType; - typedef SparseMatrix UmfpackMatrixType; + typedef SparseMatrix UmfpackMatrixType; typedef Ref UmfpackMatrixRef; enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -192,8 +328,8 @@ class UmfPackLU : public SparseSolverBase > ~UmfPackLU() { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex()); } inline Index 
rows() const { return mp_matrix.rows(); } @@ -201,7 +337,7 @@ class UmfPackLU : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -241,8 +377,8 @@ class UmfPackLU : public SparseSolverBase > template void compute(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); analyzePattern_impl(); factorize_impl(); @@ -257,8 +393,8 @@ class UmfPackLU : public SparseSolverBase > template void analyzePattern(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); @@ -309,7 +445,7 @@ class UmfPackLU : public SparseSolverBase > { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); if(m_numeric) - umfpack_free_numeric(&m_numeric,Scalar()); + umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); @@ -320,28 +456,28 @@ class UmfPackLU : public SparseSolverBase > * * \sa umfpackControl() */ - void umfpackReportControl() + void printUmfpackControl() { - umfpack_report_control(m_control.data(), Scalar()); + umfpack_report_control(m_control.data(), Scalar(),StorageIndex()); } /** Prints statistics collected by UmfPack. 
* * \sa analyzePattern(), compute() */ - void umfpackReportInfo() + void printUmfpackInfo() { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); - umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar()); + umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex()); } /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization). * * \sa analyzePattern(), compute() */ - void umfpackReportStatus() { + void printUmfpackStatus() { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); - umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar()); + umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex()); } /** \internal */ @@ -362,13 +498,13 @@ class UmfPackLU : public SparseSolverBase > m_symbolic = 0; m_extractedDataAreDirty = true; - umfpack_defaults(m_control.data(), Scalar()); + umfpack_defaults(m_control.data(), Scalar(),StorageIndex()); } void analyzePattern_impl() { - m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), - internal::convert_index(mp_matrix.cols()), + m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), + internal::convert_index(mp_matrix.cols()), mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), &m_symbolic, m_control.data(), m_umfpackInfo.data()); @@ -408,7 +544,7 @@ class UmfPackLU : public SparseSolverBase > // cached data to reduce reallocation, etc. 
mutable LUMatrixType m_l; - int m_fact_errorCode; + StorageIndex m_fact_errorCode; UmfpackControl m_control; mutable UmfpackInfo m_umfpackInfo; @@ -438,7 +574,7 @@ void UmfPackLU::extractData() const if (m_extractedDataAreDirty) { // get size of the data - int lnz, unz, rows, cols, nz_udiag; + StorageIndex lnz, unz, rows, cols, nz_udiag; umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); // allocate data @@ -464,7 +600,7 @@ template typename UmfPackLU::Scalar UmfPackLU::determinant() const { Scalar det; - umfpack_get_determinant(&det, 0, m_numeric, 0); + umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex()); return det; } @@ -477,7 +613,6 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet"); eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); - int errorCode; Scalar* x_ptr = 0; Matrix x_tmp; if(x.innerStride()!=1) @@ -489,9 +624,10 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas { if(x.innerStride()==1) x_ptr = &x.col(j).coeffRef(0); - errorCode = umfpack_solve(UMFPACK_A, - mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data()); + StorageIndex errorCode = umfpack_solve(UMFPACK_A, + mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), + x_ptr, &b.const_cast_derived().col(j).coeffRef(0), + m_numeric, m_control.data(), m_umfpackInfo.data()); if(x.innerStride()!=1) x.col(j) = x_tmp; if (errorCode!=0) diff --git a/inst/include/Eigen/src/misc/lapacke.h b/inst/include/Eigen/src/misc/lapacke.h index 8c7e79b0..3d8e24f5 100755 --- a/inst/include/Eigen/src/misc/lapacke.h +++ b/inst/include/Eigen/src/misc/lapacke.h @@ -43,10 +43,6 @@ #include "lapacke_config.h" #endif -#ifdef __cplusplus -extern "C" { -#endif /* 
__cplusplus */ - #include #ifndef lapack_int @@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im ); #endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + #ifndef LAPACKE_malloc #define LAPACKE_malloc( size ) malloc( size ) #endif diff --git a/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 05a7449b..0e5d5445 100644 --- a/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -75,6 +75,32 @@ max return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); } +/** \returns an expression of the coefficient-wise absdiff of \c *this and \a other + * + * Example: \include Cwise_absolute_difference.cpp + * Output: \verbinclude Cwise_absolute_difference.out + * + * \sa absolute_difference() + */ +EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference,absolute_difference) + +/** \returns an expression of the coefficient-wise absolute_difference of \c *this and scalar \a other + * + * \sa absolute_difference() + */ +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +absolute_difference +#else +(absolute_difference) +#endif +(const Scalar &other) const +{ + return (absolute_difference)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + /** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents. * * This function computes the coefficient-wise power. @@ -314,9 +340,9 @@ polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS &n) const * * It returns the Riemann zeta function of two arguments \c *this and \a q: * - * \param *this is the exposent, it must be > 1 * \param q is the shift, it must be > 0 * + * \note *this is the exponent, it must be > 1. * \note This function supports only float and double scalar types. 
To support other scalar types, the user has * to provide implementations of zeta(T,T) for any scalar type T to be supported. * diff --git a/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h index ebaa3f19..13c55f4b 100644 --- a/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -10,9 +10,11 @@ typedef CwiseUnaryOp, const Derived> Inverse typedef CwiseUnaryOp, const Derived> BooleanNotReturnType; typedef CwiseUnaryOp, const Derived> ExpReturnType; +typedef CwiseUnaryOp, const Derived> Expm1ReturnType; typedef CwiseUnaryOp, const Derived> LogReturnType; typedef CwiseUnaryOp, const Derived> Log1pReturnType; typedef CwiseUnaryOp, const Derived> Log10ReturnType; +typedef CwiseUnaryOp, const Derived> Log2ReturnType; typedef CwiseUnaryOp, const Derived> CosReturnType; typedef CwiseUnaryOp, const Derived> SinReturnType; typedef CwiseUnaryOp, const Derived> TanReturnType; @@ -20,11 +22,18 @@ typedef CwiseUnaryOp, const Derived> AcosReturn typedef CwiseUnaryOp, const Derived> AsinReturnType; typedef CwiseUnaryOp, const Derived> AtanReturnType; typedef CwiseUnaryOp, const Derived> TanhReturnType; +typedef CwiseUnaryOp, const Derived> LogisticReturnType; typedef CwiseUnaryOp, const Derived> SinhReturnType; +#if EIGEN_HAS_CXX11_MATH +typedef CwiseUnaryOp, const Derived> AtanhReturnType; +typedef CwiseUnaryOp, const Derived> AsinhReturnType; +typedef CwiseUnaryOp, const Derived> AcoshReturnType; +#endif typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> SquareReturnType; typedef CwiseUnaryOp, const Derived> CubeReturnType; typedef CwiseUnaryOp, const Derived> RoundReturnType; +typedef CwiseUnaryOp, const Derived> RintReturnType; typedef CwiseUnaryOp, const Derived> FloorReturnType; typedef CwiseUnaryOp, const Derived> CeilReturnType; typedef CwiseUnaryOp, const Derived> IsNaNReturnType; @@ -90,6 +99,20 @@ exp() const return 
ExpReturnType(derived()); } +/** \returns an expression of the coefficient-wise exponential of *this minus 1. + * + * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1, + * however, with finite precision, this function is much more accurate when \c x is close to zero. + * + * \sa Math functions, exp() + */ +EIGEN_DEVICE_FUNC +inline const Expm1ReturnType +expm1() const +{ + return Expm1ReturnType(derived()); +} + /** \returns an expression of the coefficient-wise logarithm of *this. * * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the @@ -98,7 +121,7 @@ exp() const * Example: \include Cwise_log.cpp * Output: \verbinclude Cwise_log.out * - * \sa Math functions, exp() + * \sa Math functions, log() */ EIGEN_DEVICE_FUNC inline const LogReturnType @@ -137,6 +160,18 @@ log10() const return Log10ReturnType(derived()); } +/** \returns an expression of the coefficient-wise base-2 logarithm of *this. + * + * This function computes the coefficient-wise base-2 logarithm. + * + */ +EIGEN_DEVICE_FUNC +inline const Log2ReturnType +log2() const +{ + return Log2ReturnType(derived()); +} + /** \returns an expression of the coefficient-wise square root of *this. * * This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the @@ -311,7 +346,7 @@ sinh() const * Example: \include Cwise_cosh.cpp * Output: \verbinclude Cwise_cosh.out * - * \sa Math functions, tan(), sinh(), cosh() + * \sa Math functions, tanh(), sinh(), cosh() */ EIGEN_DEVICE_FUNC inline const CoshReturnType @@ -320,6 +355,50 @@ cosh() const return CoshReturnType(derived()); } +#if EIGEN_HAS_CXX11_MATH +/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this. 
+ * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AtanhReturnType +atanh() const +{ + return AtanhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AsinhReturnType +asinh() const +{ + return AsinhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AcoshReturnType +acosh() const +{ + return AcoshReturnType(derived()); +} +#endif + +/** \returns an expression of the coefficient-wise logistic of *this. + */ +EIGEN_DEVICE_FUNC +inline const LogisticReturnType +logistic() const +{ + return LogisticReturnType(derived()); +} + /** \returns an expression of the coefficient-wise inverse of *this. * * Example: \include Cwise_inverse.cpp @@ -362,6 +441,20 @@ cube() const return CubeReturnType(derived()); } +/** \returns an expression of the coefficient-wise rint of *this. + * + * Example: \include Cwise_rint.cpp + * Output: \verbinclude Cwise_rint.out + * + * \sa Math functions, ceil(), floor() + */ +EIGEN_DEVICE_FUNC +inline const RintReturnType +rint() const +{ + return RintReturnType(derived()); +} + /** \returns an expression of the coefficient-wise round of *this. * * Example: \include Cwise_round.cpp @@ -404,6 +497,45 @@ ceil() const return CeilReturnType(derived()); } +template struct ShiftRightXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type arithmetically + * shifted right by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. 
+ * + * \sa shiftLeft() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftRightXpr::Type +shiftRight() const +{ + return typename ShiftRightXpr::Type(derived()); +} + + +template struct ShiftLeftXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type logically + * shifted left by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftRight() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftLeftXpr::Type +shiftLeft() const +{ + return typename ShiftLeftXpr::Type(derived()); +} + /** \returns an expression of the coefficient-wise isnan of *this. * * Example: \include Cwise_isNaN.cpp @@ -471,14 +603,12 @@ typedef CwiseUnaryOp, const Derived> LgammaRe typedef CwiseUnaryOp, const Derived> DigammaReturnType; typedef CwiseUnaryOp, const Derived> ErfReturnType; typedef CwiseUnaryOp, const Derived> ErfcReturnType; +typedef CwiseUnaryOp, const Derived> NdtriReturnType; /** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|). * * \specialfunctions_module * - * Example: \include Cwise_lgamma.cpp - * Output: \verbinclude Cwise_lgamma.out - * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar * type T to be supported. @@ -514,9 +644,6 @@ digamma() const * * \specialfunctions_module * - * Example: \include Cwise_erf.cpp - * Output: \verbinclude Cwise_erf.out - * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar * type T to be supported. 
@@ -535,9 +662,6 @@ erf() const * * \specialfunctions_module * - * Example: \include Cwise_erfc.cpp - * Output: \verbinclude Cwise_erfc.out - * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar * type T to be supported. @@ -550,3 +674,23 @@ erfc() const { return ErfcReturnType(derived()); } + +/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function + * function of *this. + * + * \specialfunctions_module + * + * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the + * Gaussian probability density function (integrated from minus infinity to x) is equal to y. + * + * \note This function supports only float and double scalar types. To support other scalar types, + * the user has to provide implementations of ndtri(T) for any scalar type T to be supported. + * + * \sa Math functions + */ +EIGEN_DEVICE_FUNC +inline const NdtriReturnType +ndtri() const +{ + return NdtriReturnType(derived()); +} diff --git a/inst/include/Eigen/src/plugins/BlockMethods.h b/inst/include/Eigen/src/plugins/BlockMethods.h index ac35a008..63a52a6f 100644 --- a/inst/include/Eigen/src/plugins/BlockMethods.h +++ b/inst/include/Eigen/src/plugins/BlockMethods.h @@ -40,68 +40,126 @@ typedef const VectorBlock ConstSegmentReturnType; template struct FixedSegmentReturnType { typedef VectorBlock Type; }; template struct ConstFixedSegmentReturnType { typedef const VectorBlock Type; }; +/// \internal inner-vector +typedef Block InnerVectorReturnType; +typedef Block ConstInnerVectorReturnType; + +/// \internal set of inner-vectors +typedef Block InnerVectorsReturnType; +typedef Block ConstInnerVectorsReturnType; + #endif // not EIGEN_PARSED_BY_DOXYGEN -/// \returns a dynamic-size expression of a block in *this. 
+/// \returns an expression of a block in \c *this with either dynamic or fixed sizes. /// -/// \param startRow the first row in the block -/// \param startCol the first column in the block -/// \param blockRows the number of rows in the block -/// \param blockCols the number of columns in the block +/// \param startRow the first row in the block +/// \param startCol the first column in the block +/// \param blockRows number of rows in the block, specified at either run-time or compile-time +/// \param blockCols number of columns in the block, specified at either run-time or compile-time +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// -/// Example: \include MatrixBase_block_int_int_int_int.cpp +/// Example using runtime (aka dynamic) sizes: \include MatrixBase_block_int_int_int_int.cpp /// Output: \verbinclude MatrixBase_block_int_int_int_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// \newin{3.4}: +/// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic. +/// Here is an example with a fixed number of rows \c NRows and dynamic number of columns \c cols: +/// \code +/// mat.block(i,j,fix,cols) +/// \endcode +/// +/// This function thus fully covers the features offered by the following overloads block(Index, Index), +/// and block(Index, Index, Index, Index) that are thus obsolete. Indeed, this generic version avoids +/// redundancy, it preserves the argument order, and prevents the need to rely on the template keyword in templated code. 
+/// +/// but with less redundancy and more consistency as it does not modify the argument order +/// and seamlessly enable hybrid fixed/dynamic sizes. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size matrix, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index) +/// \sa class Block, fix, fix(int) /// -EIGEN_DEVICE_FUNC -inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) { - return BlockXpr(derived(), startRow, startCol, blockRows, blockCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type( + derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols)); } -/// This is the const version of block(Index,Index,Index,Index). 
*/ -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const +/// This is the const version of block(Index,Index,NRowsType,NColsType) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const { - return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type( + derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols)); } - -/// \returns a dynamic-size expression of a top-right corner of *this. +/// \returns a expression of a top-right corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// -/// Example: \include MatrixBase_topRightCorner_int_int.cpp +/// Example with dynamic sizes: \include MatrixBase_topRightCorner_int_int.cpp /// Output: \verbinclude MatrixBase_topRightCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr topRightCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +topRightCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of topRightCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const +/// This is the const version of topRightCorner(NRowsType, NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +topRightCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size top-right corner of *this. +/// \returns an expression of a fixed-size top-right corner of \c *this. 
/// /// \tparam CRows the number of rows in the corner /// \tparam CCols the number of columns in the corner @@ -114,21 +172,21 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block, block(Index,Index) /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topRightCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topRightCorner() { return typename FixedBlockXpr::Type(derived(), 0, cols() - CCols); } /// This is the const version of topRightCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topRightCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - CCols); } -/// \returns an expression of a top-right corner of *this. +/// \returns an expression of a top-right corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -148,46 +206,67 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } /// This is the const version of topRightCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } -/// \returns a dynamic-size expression of a top-left corner of *this. +/// \returns an expression of a top-left corner of \c *this with either dynamic or fixed sizes. 
/// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_topLeftCorner_int_int.cpp /// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr topLeftCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +topLeftCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), 0, 0, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } /// This is the const version of topLeftCorner(Index, Index). 
-EIGEN_DEVICE_FUNC -inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +topLeftCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), 0, 0, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size top-left corner of *this. +/// \returns an expression of a fixed-size top-left corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. /// @@ -196,24 +275,24 @@ inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topLeftCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topLeftCorner() { return typename FixedBlockXpr::Type(derived(), 0, 0); } /// This is the const version of topLeftCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topLeftCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0); } -/// \returns an expression of a top-left corner of *this. +/// \returns an expression of a top-left corner of \c *this. 
/// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -233,46 +312,69 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } /// This is the const version of topLeftCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } -/// \returns a dynamic-size expression of a bottom-right corner of *this. +/// \returns an expression of a bottom-right corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_bottomRightCorner_int_int.cpp /// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr bottomRightCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +bottomRightCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols), + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of bottomRightCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const +/// This is the const version of bottomRightCorner(NRowsType, NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +bottomRightCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols), + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size bottom-right corner of *this. +/// \returns an expression of a fixed-size bottom-right corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. 
/// @@ -281,24 +383,24 @@ inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomRightCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomRightCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } /// This is the const version of bottomRightCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomRightCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } -/// \returns an expression of a bottom-right corner of *this. +/// \returns an expression of a bottom-right corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -318,46 +420,69 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } /// This is the const version of bottomRightCorner(Index, Index). 
template -inline const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } -/// \returns a dynamic-size expression of a bottom-left corner of *this. +/// \returns an expression of a bottom-left corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp /// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr bottomLeftCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +bottomLeftCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), 0, + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of bottomLeftCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const +/// This is the const version of bottomLeftCorner(NRowsType, NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename ConstFixedBlockXpr<...,...>::Type +#endif +bottomLeftCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), 0, + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size bottom-left corner of *this. +/// \returns an expression of a fixed-size bottom-left corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. 
/// @@ -366,24 +491,24 @@ inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomLeftCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomLeftCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, 0); } /// This is the const version of bottomLeftCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, 0); } -/// \returns an expression of a bottom-left corner of *this. +/// \returns an expression of a bottom-left corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -403,45 +528,66 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) +EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } /// This is the const version of bottomLeftCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const +EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } -/// \returns a block consisting of the top rows of *this. +/// \returns a block consisting of the top rows of \c *this. 
/// /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include MatrixBase_topRows_int.cpp /// Output: \verbinclude MatrixBase_topRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr topRows(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +topRows(NRowsType n) { - return RowsBlockXpr(derived(), 0, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), 0, 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of topRows(Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr topRows(Index n) const +/// This is the const version of topRows(NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +topRows(NRowsType n) const { - return ConstRowsBlockXpr(derived(), 0, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), 0, 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of the top rows of *this. +/// \returns a block consisting of the top rows of \c *this. 
/// /// \tparam N the number of rows in the block as specified at compile-time /// \param n the number of rows in the block as specified at run-time @@ -454,50 +600,69 @@ inline ConstRowsBlockXpr topRows(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type topRows(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type topRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } /// This is the const version of topRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type topRows(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type topRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } -/// \returns a block consisting of the bottom rows of *this. +/// \returns a block consisting of the bottom rows of \c *this. /// /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include MatrixBase_bottomRows_int.cpp /// Output: \verbinclude MatrixBase_bottomRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr bottomRows(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +bottomRows(NRowsType n) { - return RowsBlockXpr(derived(), rows() - n, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of bottomRows(Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr bottomRows(Index n) const +/// This is the const version of bottomRows(NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +bottomRows(NRowsType n) const { - return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of the bottom rows of *this. +/// \returns a block consisting of the bottom rows of \c *this. 
/// /// \tparam N the number of rows in the block as specified at compile-time /// \param n the number of rows in the block as specified at run-time @@ -510,51 +675,70 @@ inline ConstRowsBlockXpr bottomRows(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type bottomRows(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type bottomRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } /// This is the const version of bottomRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } -/// \returns a block consisting of a range of rows of *this. +/// \returns a block consisting of a range of rows of \c *this. /// /// \param startRow the index of the first row in the block /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include DenseBase_middleRows_int.cpp /// Output: \verbinclude DenseBase_middleRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr middleRows(Index startRow, Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +middleRows(Index startRow, NRowsType n) { - return RowsBlockXpr(derived(), startRow, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), startRow, 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of middleRows(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const +/// This is the const version of middleRows(Index,NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +middleRows(Index startRow, NRowsType n) const { - return ConstRowsBlockXpr(derived(), startRow, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), startRow, 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of a range of rows of *this. +/// \returns a block consisting of a range of rows of \c *this. 
/// /// \tparam N the number of rows in the block as specified at compile-time /// \param startRow the index of the first row in the block @@ -568,50 +752,69 @@ inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) { return typename NRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } /// This is the const version of middleRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } -/// \returns a block consisting of the left columns of *this. +/// \returns a block consisting of the left columns of \c *this. /// /// \param n the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_leftCols_int.cpp /// Output: \verbinclude MatrixBase_leftCols_int.out /// +/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr leftCols(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +leftCols(NColsType n) { - return ColsBlockXpr(derived(), 0, 0, rows(), n); + return typename NColsBlockXpr::value>::Type + (derived(), 0, 0, rows(), internal::get_runtime_value(n)); } -/// This is the const version of leftCols(Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr leftCols(Index n) const +/// This is the const version of leftCols(NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +leftCols(NColsType n) const { - return ConstColsBlockXpr(derived(), 0, 0, rows(), n); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, 0, rows(), internal::get_runtime_value(n)); } -/// \returns a block consisting of the left columns of *this. +/// \returns a block consisting of the left columns of \c *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param n the number of columns in the block as specified at run-time @@ -624,50 +827,69 @@ inline ConstColsBlockXpr leftCols(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type leftCols(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type leftCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, 0, rows(), n); } /// This is the const version of leftCols(). 
template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type leftCols(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type leftCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, 0, rows(), n); } -/// \returns a block consisting of the right columns of *this. +/// \returns a block consisting of the right columns of \c *this. /// /// \param n the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_rightCols_int.cpp /// Output: \verbinclude MatrixBase_rightCols_int.out /// +/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr rightCols(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +rightCols(NColsType n) { - return ColsBlockXpr(derived(), 0, cols() - n, rows(), n); + return typename NColsBlockXpr::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n)); } -/// This is the const version of rightCols(Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr rightCols(Index n) const +/// This is the const version of rightCols(NColsType). 
+template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +rightCols(NColsType n) const { - return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n)); } -/// \returns a block consisting of the right columns of *this. +/// \returns a block consisting of the right columns of \c *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param n the number of columns in the block as specified at run-time @@ -680,51 +902,70 @@ inline ConstColsBlockXpr rightCols(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type rightCols(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type rightCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } /// This is the const version of rightCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type rightCols(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type rightCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } -/// \returns a block consisting of a range of columns of *this. +/// \returns a block consisting of a range of columns of \c *this. /// /// \param startCol the index of the first column in the block /// \param numCols the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. 
/// /// Example: \include DenseBase_middleCols_int.cpp /// Output: \verbinclude DenseBase_middleCols_int.out /// +/// The number of columns \a numCols can also be specified at compile-time by passing Eigen::fix<N>, +/// or Eigen::fix<N>(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr middleCols(Index startCol, Index numCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +middleCols(Index startCol, NColsType numCols) { - return ColsBlockXpr(derived(), 0, startCol, rows(), numCols); + return typename NColsBlockXpr::value>::Type + (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols)); } -/// This is the const version of middleCols(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const +/// This is the const version of middleCols(Index,NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +middleCols(Index startCol, NColsType numCols) const { - return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols)); } -/// \returns a block consisting of a range of columns of *this. +/// \returns a block consisting of a range of columns of \c *this. 
/// /// \tparam N the number of columns in the block as specified at compile-time /// \param startCol the index of the first column in the block @@ -738,26 +979,26 @@ inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } /// This is the const version of middleCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } -/// \returns a fixed-size expression of a block in *this. +/// \returns a fixed-size expression of a block of \c *this. /// /// The template parameters \a NRows and \a NCols are the number of /// rows and columns in the block. 
@@ -768,29 +1009,35 @@ inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = /// Example: \include MatrixBase_block_int_int.cpp /// Output: \verbinclude MatrixBase_block_int_int.out /// +/// \note The usage of this overload is discouraged from %Eigen 3.4, better use the generic +/// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence: +/// \code +/// mat.template block<NRows,NCols>(i,j) <--> mat.block(i,j,fix<NRows>,fix<NCols>) +/// \endcode +/// /// \note since block is a templated member, the keyword template has to be used /// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type block(Index startRow, Index startCol) { return typename FixedBlockXpr::Type(derived(), startRow, startCol); } /// This is the const version of block<>(Index, Index). */ template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol); } -/// \returns an expression of a block in *this. +/// \returns an expression of a block of \c *this. /// /// \tparam NRows number of rows in block as specified at compile-time /// \tparam NCols number of columns in block as specified at compile-time @@ -805,14 +1052,25 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow /// \a NRows is \a Dynamic, and the same for the number of columns. 
/// /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp +/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out +/// +/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic +/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence: +/// \code +/// mat.template block(i,j,rows,cols) <--> mat.block(i,j,fix(rows),fix(cols)) +/// \endcode +/// If we known that, e.g., NRows==Dynamic and NCols!=Dynamic, then the equivalence becomes: +/// \code +/// mat.template block(i,j,rows,NCols) <--> mat.block(i,j,rows,fix) +/// \endcode /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { return typename FixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); @@ -820,13 +1078,14 @@ inline typename FixedBlockXpr::Type block(Index startRow, Index sta /// This is the const version of block<>(Index, Index, Index, Index). template -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); } -/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0. +/// \returns an expression of the \a i-th column of \c *this. Note that the numbering starts at 0. 
/// /// Example: \include MatrixBase_col.cpp /// Output: \verbinclude MatrixBase_col.out @@ -834,20 +1093,20 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /** * \sa row(), class Block */ -EIGEN_DEVICE_FUNC -inline ColXpr col(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ColXpr col(Index i) { return ColXpr(derived(), i); } /// This is the const version of col(). -EIGEN_DEVICE_FUNC -inline ConstColXpr col(Index i) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ConstColXpr col(Index i) const { return ConstColXpr(derived(), i); } -/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0. +/// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0. /// /// Example: \include MatrixBase_row.cpp /// Output: \verbinclude MatrixBase_row.out @@ -855,109 +1114,166 @@ inline ConstColXpr col(Index i) const EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /** * \sa col(), class Block */ -EIGEN_DEVICE_FUNC -inline RowXpr row(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +RowXpr row(Index i) { return RowXpr(derived(), i); } /// This is the const version of row(). */ -EIGEN_DEVICE_FUNC -inline ConstRowXpr row(Index i) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ConstRowXpr row(Index i) const { return ConstRowXpr(derived(), i); } -/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this. +/// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param start the first coefficient in the segment /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. 
/// /// Example: \include MatrixBase_segment_int_int.cpp /// Output: \verbinclude MatrixBase_segment_int_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// -/// \sa class Block, segment(Index) +/// \sa block(Index,Index,NRowsType,NColsType), fix, fix(int), class Block /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType segment(Index start, Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +segment(Index start, NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), start, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), start, internal::get_runtime_value(n)); } -/// This is the const version of segment(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType segment(Index start, Index n) const +/// This is the const version of segment(Index,NType). 
+template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +segment(Index start, NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), start, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), start, internal::get_runtime_value(n)); } -/// \returns a dynamic-size expression of the first coefficients of *this. +/// \returns an expression of the first coefficients of \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_start_int.cpp /// Output: \verbinclude MatrixBase_start_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. 
/// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType head(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +head(NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), 0, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), 0, internal::get_runtime_value(n)); } -/// This is the const version of head(Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType head(Index n) const +/// This is the const version of head(NType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +head(NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), 0, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), 0, internal::get_runtime_value(n)); } -/// \returns a dynamic-size expression of the last coefficients of *this. +/// \returns an expression of a last coefficients of \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_end_int.cpp /// Output: \verbinclude MatrixBase_end_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType tail(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +tail(NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), this->size() - n, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n)); } /// This is the const version of tail(Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType tail(Index n) const +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +tail(NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), this->size() - n, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n)); } /// \returns a fixed-size expression of a segment (i.e. 
a vector block) in \c *this @@ -974,11 +1290,11 @@ inline ConstSegmentReturnType tail(Index n) const /// Example: \include MatrixBase_template_int_segment.cpp /// Output: \verbinclude MatrixBase_template_int_segment.out /// -/// \sa class Block +/// \sa segment(Index,NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type segment(Index start, Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), start, n); @@ -986,14 +1302,14 @@ inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N /// This is the const version of segment(Index). template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), start, n); } -/// \returns a fixed-size expression of the first coefficients of *this. +/// \returns a fixed-size expression of the first coefficients of \c *this. 
/// /// \only_for_vectors /// @@ -1006,11 +1322,11 @@ inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index /// Example: \include MatrixBase_template_int_start.cpp /// Output: \verbinclude MatrixBase_template_int_start.out /// -/// \sa class Block +/// \sa head(NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type head(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type head(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), 0, n); @@ -1018,14 +1334,14 @@ inline typename FixedSegmentReturnType::Type head(Index n = N) /// This is the const version of head(). template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type head(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), 0, n); } -/// \returns a fixed-size expression of the last coefficients of *this. +/// \returns a fixed-size expression of the last coefficients of \c *this. /// /// \only_for_vectors /// @@ -1038,11 +1354,11 @@ inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const /// Example: \include MatrixBase_template_int_end.cpp /// Output: \verbinclude MatrixBase_template_int_end.out /// -/// \sa class Block +/// \sa tail(NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type tail(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type tail(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), size() - n); @@ -1050,9 +1366,77 @@ inline typename FixedSegmentReturnType::Type tail(Index n = N) /// This is the const version of tail. 
template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type tail(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +InnerVectorReturnType innerVector(Index outer) +{ return InnerVectorReturnType(derived(), outer); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). Read-only. +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const ConstInnerVectorReturnType innerVector(Index outer) const +{ return ConstInnerVectorReturnType(derived(), outer); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +InnerVectorsReturnType +innerVectors(Index outerStart, Index outerSize) +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); + +} + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). Read-only. +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const ConstInnerVectorsReturnType +innerVectors(Index outerStart, Index outerSize) const +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? 
cols() : outerSize); + +} + +/** \returns the i-th subvector (column or vector) according to the \c Direction + * \sa subVectors() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename internal::conditional::type +subVector(Index i) +{ + return typename internal::conditional::type(derived(),i); +} + +/** This is the const version of subVector(Index) */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename internal::conditional::type +subVector(Index i) const +{ + return typename internal::conditional::type(derived(),i); +} + +/** \returns the number of subvectors (rows or columns) in the direction \c Direction + * \sa subVector(Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR +Index subVectors() const +{ return (Direction==Vertical)?cols():rows(); } diff --git a/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.h b/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.h index 89f4faaa..5418dc41 100644 --- a/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -76,6 +76,20 @@ conjugate() const return ConjugateReturnType(derived()); } +/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise. +/// +EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) +/// +/// \sa conjugate() +template +EIGEN_DEVICE_FUNC +inline typename internal::conditional::type +conjugateIf() const +{ + typedef typename internal::conditional::type ReturnType; + return ReturnType(derived()); +} + /// \returns a read-only expression of the real part of \c *this. 
/// EIGEN_DOC_UNARY_ADDONS(real,real part function) diff --git a/inst/include/Eigen/src/plugins/IndexedViewMethods.h b/inst/include/Eigen/src/plugins/IndexedViewMethods.h new file mode 100644 index 00000000..5bfb19ac --- /dev/null +++ b/inst/include/Eigen/src/plugins/IndexedViewMethods.h @@ -0,0 +1,262 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if !defined(EIGEN_PARSED_BY_DOXYGEN) + +// This file is automatically included twice to generate const and non-const versions + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#define EIGEN_INDEXED_VIEW_METHOD_CONST const +#define EIGEN_INDEXED_VIEW_METHOD_TYPE ConstIndexedViewType +#else +#define EIGEN_INDEXED_VIEW_METHOD_CONST +#define EIGEN_INDEXED_VIEW_METHOD_TYPE IndexedViewType +#endif + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +protected: + +// define some aliases to ease readability + +template +struct IvcRowType : public internal::IndexedViewCompatibleType {}; + +template +struct IvcColType : public internal::IndexedViewCompatibleType {}; + +template +struct IvcType : public internal::IndexedViewCompatibleType {}; + +typedef typename internal::IndexedViewCompatibleType::type IvcIndex; + +template +typename IvcRowType::type +ivcRow(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().rows()),Specialized); +} + +template +typename IvcColType::type +ivcCol(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().cols()),Specialized); +} + +template +typename IvcColType::type +ivcSize(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, 
internal::variable_if_dynamic(derived().size()),Specialized); +} + +public: + +#endif + +template +struct EIGEN_INDEXED_VIEW_METHOD_TYPE { + typedef IndexedView::type, + typename IvcColType::type> type; +}; + +// This is the generic version + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsIndexedView, + typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type >::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type + (derived(), ivcRow(rowIndices), ivcCol(colIndices)); +} + +// The following overload returns a Block<> object + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsBlock, + typename internal::traits::type>::BlockType>::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + typedef typename internal::traits::type>::BlockType BlockType; + typename IvcRowType::type actualRowIndices = ivcRow(rowIndices); + typename IvcColType::type actualColIndices = ivcCol(colIndices); + return BlockType(derived(), + internal::first(actualRowIndices), + internal::first(actualColIndices), + internal::size(actualRowIndices), + internal::size(actualColIndices)); +} + +// The following overload returns a Scalar + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsScalar, + CoeffReturnType >::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols())); +} + +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +// The following three overloads are needed to handle raw Index[N] arrays. 
+ +template +IndexedView::type> +operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView::type> + (derived(), rowIndices, ivcCol(colIndices)); +} + +template +IndexedView::type, const ColIndicesT (&)[ColIndicesN]> +operator()(const RowIndices& rowIndices, const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView::type,const ColIndicesT (&)[ColIndicesN]> + (derived(), ivcRow(rowIndices), colIndices); +} + +template +IndexedView +operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView + (derived(), rowIndices, colIndices); +} + +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +// Overloads for 1D vectors/arrays + +template +typename internal::enable_if< + IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), + IndexedView::type> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView::type> + (derived(), IvcIndex(0), ivcCol(indices)); +} + +template +typename internal::enable_if< + (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), + IndexedView::type,IvcIndex> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView::type,IvcIndex> + (derived(), ivcRow(indices), IvcIndex(0)); +} + +template +typename internal::enable_if< + (internal::get_compile_time_incr::type>::value==1) && (!internal::is_valid_index_type::value) && (!symbolic::is_symbolic::value), + VectorBlock::value> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + typename IvcType::type actualIndices = ivcSize(indices); + 
return VectorBlock::value> + (derived(), internal::first(actualIndices), internal::size(actualIndices)); +} + +template +typename internal::enable_if::value, CoeffReturnType >::type +operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return Base::operator()(internal::eval_expr_given_size(id,size())); +} + +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +template +typename internal::enable_if >::type +operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView + (derived(), IvcIndex(0), indices); +} + +template +typename internal::enable_if >::type +operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView + (derived(), indices, IvcIndex(0)); +} + +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +#undef EIGEN_INDEXED_VIEW_METHOD_CONST +#undef EIGEN_INDEXED_VIEW_METHOD_TYPE + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#define EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#include "IndexedViewMethods.h" +#undef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#endif + +#else // EIGEN_PARSED_BY_DOXYGEN + +/** + * \returns a generic submatrix view defined by the rows and columns indexed \a rowIndices and \a colIndices respectively. + * + * Each parameter must either be: + * - An integer indexing a single row or column + * - Eigen::all indexing the full set of respective rows or columns in increasing order + * - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions + * - Any %Eigen's vector/array of integers or expressions + * - Plain C arrays: \c int[N] + * - And more generally any type exposing the following two member functions: + * \code + * operator[]() const; + * size() const; + * \endcode + * where \c stands for any integer type compatible with Eigen::Index (i.e. \c std::ptrdiff_t). 
+ * + * The last statement implies compatibility with \c std::vector, \c std::valarray, \c std::array, many of the Range-v3's ranges, etc. + * + * If the submatrix can be represented using a starting position \c (i,j) and positive sizes \c (rows,columns), then this + * method will returns a Block object after extraction of the relevant information from the passed arguments. This is the case + * when all arguments are either: + * - An integer + * - Eigen::all + * - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and Eigen::seqN(a,N). + * + * Otherwise a more general IndexedView object will be returned, after conversion of the inputs + * to more suitable types \c RowIndices' and \c ColIndices'. + * + * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter. + * + * See also this question and its answer for an example of how to duplicate coefficients. + * + * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index) + */ +template +IndexedView_or_Block +operator()(const RowIndices& rowIndices, const ColIndices& colIndices); + +/** This is an overload of operator()(const RowIndices&, const ColIndices&) for 1D vectors or arrays + * + * \only_for_vectors + */ +template +IndexedView_or_VectorBlock +operator()(const Indices& indices); + +#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h index f1084abe..a0feef87 100644 --- a/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -39,10 +39,10 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> 
cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise != operator of *this and \a other @@ -59,10 +59,10 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise min of *this and \a other diff --git a/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h index b1be3d56..0514d8f7 100644 --- a/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h +++ b/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h @@ -14,6 +14,7 @@ typedef CwiseUnaryOp, const Derived> CwiseAbsReturnType; typedef CwiseUnaryOp, const Derived> CwiseAbs2ReturnType; +typedef CwiseUnaryOp, const Derived> CwiseArgReturnType; typedef CwiseUnaryOp, const Derived> CwiseSqrtReturnType; typedef CwiseUnaryOp, const Derived> CwiseSignReturnType; typedef CwiseUnaryOp, const Derived> CwiseInverseReturnType; @@ -82,4 +83,13 @@ EIGEN_DEVICE_FUNC inline const CwiseInverseReturnType cwiseInverse() const { return CwiseInverseReturnType(derived()); } +/// \returns an expression of the coefficient-wise phase angle of \c *this +/// +/// Example: \include MatrixBase_cwiseArg.cpp +/// Output: \verbinclude MatrixBase_cwiseArg.out +/// +EIGEN_DOC_UNARY_ADDONS(cwiseArg,arg) +EIGEN_DEVICE_FUNC +inline const CwiseArgReturnType +cwiseArg() const { return 
CwiseArgReturnType(derived()); } diff --git a/inst/include/Eigen/src/plugins/ReshapedMethods.h b/inst/include/Eigen/src/plugins/ReshapedMethods.h new file mode 100644 index 00000000..482a6b04 --- /dev/null +++ b/inst/include/Eigen/src/plugins/ReshapedMethods.h @@ -0,0 +1,149 @@ + +#ifdef EIGEN_PARSED_BY_DOXYGEN + +/// \returns an expression of \c *this with reshaped sizes. +/// +/// \param nRows the number of rows in the reshaped expression, specified at either run-time or compile-time, or AutoSize +/// \param nCols the number of columns in the reshaped expression, specified at either run-time or compile-time, or AutoSize +/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor), +/// or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor. +/// \tparam NRowsType the type of the value handling the number of rows, typically Index. +/// \tparam NColsType the type of the value handling the number of columns, typically Index. +/// +/// Dynamic size example: \include MatrixBase_reshaped_int_int.cpp +/// Output: \verbinclude MatrixBase_reshaped_int_int.out +/// +/// The number of rows \a nRows and columns \a nCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic. 
+/// Here is an example with a fixed number of rows and columns: +/// \include MatrixBase_reshaped_fixed.cpp +/// Output: \verbinclude MatrixBase_reshaped_fixed.out +/// +/// Finally, one of the sizes parameter can be automatically deduced from the other one by passing AutoSize as in the following example: +/// \include MatrixBase_reshaped_auto.cpp +/// Output: \verbinclude MatrixBase_reshaped_auto.out +/// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and +/// that the other size is passed at compile-time using Eigen::fix as above. +/// +/// \sa class Reshaped, fix, fix(int) +/// +template +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped(NRowsType nRows, NColsType nCols); + +/// This is the const version of reshaped(NRowsType,NColsType). +template +EIGEN_DEVICE_FUNC +inline const Reshaped +reshaped(NRowsType nRows, NColsType nCols) const; + +/// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector +/// +/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor), +/// or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor. +/// +/// This overloads is essentially a shortcut for `A.reshaped(AutoSize,fix<1>)`. +/// +/// - If `Order==ColMajor` (the default), then it returns a column-vector from the stacked columns of \c *this. +/// - If `Order==RowMajor`, then it returns a column-vector from the stacked rows of \c *this. +/// - If `Order==AutoOrder`, then it returns a column-vector with elements stacked following the storage order of \c *this. +/// This mode is the recommended one when the particular ordering of the element is not relevant. 
+/// +/// Example: +/// \include MatrixBase_reshaped_to_vector.cpp +/// Output: \verbinclude MatrixBase_reshaped_to_vector.out +/// +/// If you want more control, you can still fall back to reshaped(NRowsType,NColsType). +/// +/// \sa reshaped(NRowsType,NColsType), class Reshaped +/// +template +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped(); + +/// This is the const version of reshaped(). +template +EIGEN_DEVICE_FUNC +inline const Reshaped +reshaped() const; + +#else + +// This file is automatically included twice to generate const and non-const versions + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS +#define EIGEN_RESHAPED_METHOD_CONST const +#else +#define EIGEN_RESHAPED_METHOD_CONST +#endif + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS + +// This part is included once + +#endif + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value, + internal::get_compiletime_reshape_size::value> +reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped::value, + internal::get_compiletime_reshape_size::value> + (derived(), + internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), + internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); +} + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value, + internal::get_compiletime_reshape_size::value, + internal::get_compiletime_reshape_order::value> +reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped::value, + internal::get_compiletime_reshape_size::value, + internal::get_compiletime_reshape_order::value> + (derived(), + internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), + internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); +} + +// Views as linear vectors + +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped() EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped(derived(),size(),1); +} + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value> +reshaped() 
EIGEN_RESHAPED_METHOD_CONST +{ + EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER); + return Reshaped::value> + (derived(), size(), 1); +} + +#undef EIGEN_RESHAPED_METHOD_CONST + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS +#define EIGEN_RESHAPED_METHOD_2ND_PASS +#include "ReshapedMethods.h" +#undef EIGEN_RESHAPED_METHOD_2ND_PASS +#endif + +#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/inst/include/unsupported/Eigen/AdolcForward b/inst/include/unsupported/Eigen/AdolcForward index 15f5f073..56caeaeb 100644 --- a/inst/include/unsupported/Eigen/AdolcForward +++ b/inst/include/unsupported/Eigen/AdolcForward @@ -40,7 +40,7 @@ # undef realloc #endif -#include +#include "../../Eigen/Core" namespace Eigen { @@ -74,6 +74,9 @@ inline adouble imag(const adouble&) { return 0.; } inline adouble abs(const adouble& x) { return fabs(x); } inline adouble abs2(const adouble& x) { return x*x; } +inline bool (isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); } +inline bool (isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); } + } namespace Eigen { diff --git a/inst/include/unsupported/Eigen/AlignedVector3 b/inst/include/unsupported/Eigen/AlignedVector3 index 47a86d4c..4fa1842a 100644 --- a/inst/include/unsupported/Eigen/AlignedVector3 +++ b/inst/include/unsupported/Eigen/AlignedVector3 @@ -10,7 +10,9 @@ #ifndef EIGEN_ALIGNED_VECTOR3 #define EIGEN_ALIGNED_VECTOR3 -#include +#include "../../Eigen/Geometry" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" namespace Eigen { @@ -76,6 +78,9 @@ template class AlignedVector3 { return m_coeffs.coeffRef(index);} + inline AlignedVector3() + {} + inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, Scalar(0)) {} @@ -129,6 +134,9 @@ template class AlignedVector3 inline AlignedVector3 operator-(const AlignedVector3& other) const { return AlignedVector3(m_coeffs - other.m_coeffs); } + inline 
AlignedVector3 operator-() const + { return AlignedVector3(-m_coeffs); } + inline AlignedVector3 operator-=(const AlignedVector3& other) { m_coeffs -= other.m_coeffs; return *this; } @@ -221,4 +229,6 @@ struct evaluator > } +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + #endif // EIGEN_ALIGNED_VECTOR3 diff --git a/inst/include/unsupported/Eigen/ArpackSupport b/inst/include/unsupported/Eigen/ArpackSupport index a0d4820e..67c4ac83 100644 --- a/inst/include/unsupported/Eigen/ArpackSupport +++ b/inst/include/unsupported/Eigen/ArpackSupport @@ -9,7 +9,7 @@ #ifndef EIGEN_ARPACKSUPPORT_MODULE_H #define EIGEN_ARPACKSUPPORT_MODULE_H -#include +#include "../../Eigen/Core" /** \defgroup ArpackSupport_Module Arpack support module * @@ -20,12 +20,11 @@ * \endcode */ -#include +#include "../../Eigen/SparseCholesky" -#include +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" #include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h" -#include +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_ARPACKSUPPORT_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/inst/include/unsupported/Eigen/AutoDiff b/inst/include/unsupported/Eigen/AutoDiff index abf5b7d6..7a4ff460 100644 --- a/inst/include/unsupported/Eigen/AutoDiff +++ b/inst/include/unsupported/Eigen/AutoDiff @@ -28,11 +28,17 @@ namespace Eigen { //@{ } +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + #include "src/AutoDiff/AutoDiffScalar.h" // #include "src/AutoDiff/AutoDiffVector.h" #include "src/AutoDiff/AutoDiffJacobian.h" +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + + + namespace Eigen { //@} } diff --git a/inst/include/unsupported/Eigen/BVH b/inst/include/unsupported/Eigen/BVH index 0161a540..666c9835 100644 --- a/inst/include/unsupported/Eigen/BVH +++ b/inst/include/unsupported/Eigen/BVH @@ -10,9 +10,9 @@ #ifndef EIGEN_BVH_MODULE_H #define EIGEN_BVH_MODULE_H -#include -#include -#include +#include "../../Eigen/Core" 
+#include "../../Eigen/Geometry" +#include "../../Eigen/StdVector" #include #include diff --git a/inst/include/unsupported/Eigen/CXX11/Tensor b/inst/include/unsupported/Eigen/CXX11/Tensor index bb6523d1..0938bb55 100644 --- a/inst/include/unsupported/Eigen/CXX11/Tensor +++ b/inst/include/unsupported/Eigen/CXX11/Tensor @@ -13,21 +13,11 @@ #include "../../../Eigen/Core" -#ifdef EIGEN_USE_SYCL -#undef min -#undef max -#undef isnan -#undef isinf -#undef isfinite -#include -#include -#include -#include -#endif - -#include +#if EIGEN_HAS_CXX11 #include "../SpecialFunctions" + +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" @@ -43,44 +33,25 @@ * Much of the documentation can be found \ref eigen_tensors "here". */ +#include +#include #include #include #include - -#ifdef _WIN32 -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#else -#include -#endif - -#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 #include -#endif +#include -#ifdef _WIN32 -#include -#elif defined(__APPLE__) -#include -#else -#include -#endif - -#ifdef EIGEN_USE_THREADS +#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) #include "ThreadPool" #endif #ifdef EIGEN_USE_GPU -#include -#include -#if __cplusplus >= 201103L -#include -#include -#endif + #include + #if defined(EIGEN_USE_HIP) + #include + #else + #include + #endif #endif #include "src/Tensor/TensorMacros.h" @@ -90,7 +61,10 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceCuda.h" +#include "src/Tensor/TensorDeviceGpu.h" +#ifndef gpu_assert +#define gpu_assert(x) +#endif #include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorIndexList.h" #include 
"src/Tensor/TensorDimensionList.h" @@ -103,18 +77,19 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" +#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionCuda.h" +#include "src/Tensor/TensorReductionGpu.h" #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionCuda.h" +#include "src/Tensor/TensorContractionGpu.h" #include "src/Tensor/TensorConversion.h" #include "src/Tensor/TensorConvolution.h" #include "src/Tensor/TensorFFT.h" @@ -136,8 +111,15 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGenerator.h" #include "src/Tensor/TensorAssign.h" #include "src/Tensor/TensorScan.h" +#include "src/Tensor/TensorTrace.h" + +#ifdef EIGEN_USE_SYCL +#include "src/Tensor/TensorReductionSycl.h" +#include "src/Tensor/TensorConvolutionSycl.h" +#include "src/Tensor/TensorContractionSycl.h" +#include "src/Tensor/TensorScanSycl.h" +#endif -#include "src/Tensor/TensorSycl.h" #include "src/Tensor/TensorExecutor.h" #include "src/Tensor/TensorDevice.h" @@ -149,6 +131,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorIO.h" -#include +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" +#endif // EIGEN_HAS_CXX11 //#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/inst/include/unsupported/Eigen/CXX11/TensorSymmetry b/inst/include/unsupported/Eigen/CXX11/TensorSymmetry index fb1b0c0f..b09c5e47 100644 --- a/inst/include/unsupported/Eigen/CXX11/TensorSymmetry +++ b/inst/include/unsupported/Eigen/CXX11/TensorSymmetry @@ -10,9 +10,9 @@ #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE #define 
EIGEN_CXX11_TENSORSYMMETRY_MODULE -#include +#include "Tensor" -#include +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" #include "src/util/CXX11Meta.h" @@ -33,7 +33,7 @@ #include "src/TensorSymmetry/StaticSymmetry.h" #include "src/TensorSymmetry/DynamicSymmetry.h" -#include +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE diff --git a/inst/include/unsupported/Eigen/CXX11/ThreadPool b/inst/include/unsupported/Eigen/CXX11/ThreadPool index 09d637e9..c5cafb2a 100644 --- a/inst/include/unsupported/Eigen/CXX11/ThreadPool +++ b/inst/include/unsupported/Eigen/CXX11/ThreadPool @@ -12,7 +12,7 @@ #include "../../../Eigen/Core" -#include +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module * @@ -30,10 +30,9 @@ // The code depends on CXX11, so only include the module if the // compiler supports it. -#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 +#if (EIGEN_COMP_CXXVER >= 11) #include #include -#include #include #include @@ -44,22 +43,32 @@ #include #include #include +#include + +// There are non-parenthesized calls to "max" in the header, +// which trigger a check in test/main.h causing compilation to fail. +// We work around the check here by removing the check for max in +// the case where we have to emulate thread_local. 
+#ifdef max +#undef max +#endif +#include #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" #include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadYield.h" +#include "src/ThreadPool/ThreadCancel.h" #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" #include "src/ThreadPool/ThreadPoolInterface.h" #include "src/ThreadPool/ThreadEnvironment.h" -#include "src/ThreadPool/SimpleThreadPool.h" +#include "src/ThreadPool/Barrier.h" #include "src/ThreadPool/NonBlockingThreadPool.h" #endif -#include +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CXX11_THREADPOOL_MODULE - diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md b/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md index da70fa21..2f65b1b0 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -3,8 +3,6 @@ Tensors are multidimensional arrays of elements. Elements are typically scalars, but more complex types such as strings are also supported. -[TOC] - ## Tensor Classes You can manipulate a tensor with one of the following classes. They all are in @@ -21,7 +19,7 @@ matrix. Tensors of this class are resizable. For example, if you assign a tensor of a different size to a Tensor, that tensor is resized to match its new value. -#### Constructor `Tensor(size0, size1, ...)` +#### Constructor Tensor(size0, size1, ...) Constructor for a Tensor. The constructor must be passed `rank` integers indicating the sizes of the instance along each of the the `rank` @@ -34,7 +32,7 @@ dimensions. // Resize t_3d by assigning a tensor of different sizes, but same rank. t_3d = Tensor(3, 4, 3); -#### Constructor `Tensor(size_array)` +#### Constructor Tensor(size_array) Constructor where the sizes for the constructor are specified as an array of values instead of an explicitly list of parameters. 
The array type to use is @@ -45,7 +43,7 @@ from an initializer list. Tensor t_2d({5, 7}); -### Class `TensorFixedSize>` +### Class TensorFixedSize> Class to use for tensors of fixed size, where the size is known at compile time. Fixed sized tensors can provide very fast computations because all their @@ -57,7 +55,7 @@ tensor data is held onto the stack and does not cause heap allocation and free. // Create a 4 x 3 tensor of floats. TensorFixedSize> t_4x3; -### Class `TensorMap>` +### Class TensorMap> This is the class to use to create a tensor on top of memory allocated and owned by another part of your code. It allows to view any piece of allocated @@ -67,7 +65,7 @@ data are stored. A TensorMap is not resizable because it does not own the memory where its data are stored. -#### Constructor `TensorMap>(data, size0, size1, ...)` +#### Constructor TensorMap>(data, size0, size1, ...) Constructor for a Tensor. The constructor must be passed a pointer to the storage for the data, and "rank" size attributes. The storage has to be @@ -83,17 +81,17 @@ large enough to hold all the data. // You can also map fixed-size tensors. Here we get a 1d view of // the 2d fixed-size tensor. - TensorFixedSize> t_4x3; + TensorFixedSize> t_4x3; TensorMap> t_12(t_4x3.data(), 12); -#### Class `TensorRef` +#### Class TensorRef See Assigning to a TensorRef below. ## Accessing Tensor Elements -#### ` tensor(index0, index1...)` +#### tensor(index0, index1...) Return the element at position `(index0, index1...)` in tensor `tensor`. You must pass as many parameters as the rank of `tensor`. @@ -278,7 +276,7 @@ Simiarly, assigning an expression to a TensorMap causes its evaluation. Like tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to have the rank and sizes of the expression that are assigned to them. -#### Calling `eval()`. +#### Calling eval(). 
When you compute large composite expressions, you sometimes want to tell Eigen that an intermediate value in the expression tree is worth evaluating ahead of @@ -355,7 +353,7 @@ call for the right hand side: (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); -#### Assigning to a `TensorRef`. +#### Assigning to a TensorRef. If you need to access only a few elements from the value of an expression you can avoid materializing the value in a full tensor by using a TensorRef. @@ -430,8 +428,11 @@ This is exactly the same as not inserting a `device()` call. #### Evaluating with a Thread Pool + // Create the Eigen ThreadPool + Eigen::ThreadPool pool(8 /* number of threads in pool */) + // Create the Eigen ThreadPoolDevice. - Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); + Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */); // Now just use the device when evaluating expressions. Eigen::Tensor c(30, 50); @@ -452,24 +453,24 @@ memory for tensors with cuda. In the documentation of the tensor methods and Operation we mention datatypes that are tensor-type specific: -#### `::``Dimensions` +#### ::Dimensions Acts like an array of ints. Has an `int size` attribute, and can be indexed like an array to access individual values. Used to represent the dimensions of a tensor. See `dimensions()`. -#### `::``Index` +#### ::Index Acts like an `int`. Used for indexing tensors along their dimensions. See `operator()`, `dimension()`, and `size()`. -#### `::``Scalar` +#### ::Scalar Represents the datatype of individual tensor elements. For example, for a `Tensor`, `Scalar` is the type `float`. See `setConstant()`. -#### `` +#### We use this pseudo type to indicate that a tensor Operation is returned by a method. We indicate in the text the type and dimensions of the tensor that the @@ -489,7 +490,7 @@ Tensor, TensorFixedSize, and TensorMap. 
## Metadata -### `int NumDimensions` +### int NumDimensions Constant value indicating the number of dimensions of a Tensor. This is also known as the tensor "rank". @@ -498,7 +499,7 @@ known as the tensor "rank". cout << "Dims " << a.NumDimensions; => Dims 2 -### `Dimensions dimensions()` +### Dimensions dimensions() Returns an array-like object representing the dimensions of the tensor. The actual type of the `dimensions()` result is `::``Dimensions`. @@ -516,7 +517,7 @@ If you use a C++11 compiler, you can use `auto` to simplify the code: << ", dim 1: " << d[1]; => Dim size: 2, dim 0: 3, dim 1: 4 -### `Index dimension(Index n)` +### Index dimension(Index n) Returns the n-th dimension of the tensor. The actual type of the `dimension()` result is `::``Index`, but you can @@ -527,7 +528,7 @@ always use it like an int. cout << "Dim 1: " << dim1; => Dim 1: 4 -### `Index size()` +### Index size() Returns the total number of elements in the tensor. This is the product of all the tensor dimensions. The actual type of the `size()` result is @@ -602,7 +603,7 @@ You can use one of the methods below to initialize the tensor memory. These have an immediate effect on the tensor and return the tensor itself as a result. These are not tensor Operations which delay evaluation. -### ` setConstant(const Scalar& val)` +### setConstant(const Scalar& val) Sets all elements of the tensor to the constant value `val`. `Scalar` is the type of data stored in the tensor. You can pass any value that is @@ -630,7 +631,7 @@ has a copy constructor and an `operator=()`: yolo yolo yolo -### ` setZero()` +### setZero() Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`. Returns the tensor itself in case you want to chain another call. @@ -644,7 +645,7 @@ Returns the tensor itself in case you want to chain another call. 0 0 0 0 -### ` setValues({..initializer_list})` +### setValues({..initializer_list}) Fills the tensor with explicit values specified in a std::initializer_list. 
The type of the initializer list depends on the type and rank of the tensor. @@ -680,7 +681,7 @@ code only sets the values of the first row of the tensor. 10 20 30 1000 1000 1000 -### ` setRandom()` +### setRandom() Fills the tensor with random values. Returns the tensor itself in case you want to chain another call. @@ -747,7 +748,7 @@ values of a tensor expression, the expression must either be evaluated or wrapped in a TensorRef. -### `Scalar* data()` and `const Scalar* data() const` +### Scalar* data() and const Scalar* data() const Returns a pointer to the storage for the tensor. The pointer is const if the tensor was const. This allows direct access to the data. The layout of the @@ -775,7 +776,7 @@ The chain of Operation is evaluated lazily, typically when it is assigned to a tensor. See "Controlling when Expression are Evaluated" for more details about their evaluation. -### ` constant(const Scalar& val)` +### constant(const Scalar& val) Returns a tensor of the same type and dimensions as the original tensor but where all elements have the value `val`. @@ -803,7 +804,7 @@ tensor, or multiply every element of a tensor by a scalar. 0.6 0.6 0.6 0.6 0.6 0.6 -### ` random()` +### random() Returns a tensor of the same type and dimensions as the current tensor but where all elements have random values. @@ -833,7 +834,7 @@ All these operations take a single input tensor as argument and return a tensor of the same type and dimensions as the tensor to which they are applied. The requested operations are applied to each element independently. -### ` operator-()` +### operator-() Returns a tensor of the same type and dimensions as the original tensor containing the opposite values of the original tensor. @@ -852,42 +853,42 @@ containing the opposite values of the original tensor. -1 -1 -1 -1 -1 -1 -### ` sqrt()` +### sqrt() Returns a tensor of the same type and dimensions as the original tensor containing the square roots of the original tensor. 
-### ` rsqrt()` +### rsqrt() Returns a tensor of the same type and dimensions as the original tensor containing the inverse square roots of the original tensor. -### ` square()` +### square() Returns a tensor of the same type and dimensions as the original tensor containing the squares of the original tensor values. -### ` inverse()` +### inverse() Returns a tensor of the same type and dimensions as the original tensor containing the inverse of the original tensor values. -### ` exp()` +### exp() Returns a tensor of the same type and dimensions as the original tensor containing the exponential of the original tensor. -### ` log()` +### log() Returns a tensor of the same type and dimensions as the original tensor containing the natural logarithms of the original tensor. -### ` abs()` +### abs() Returns a tensor of the same type and dimensions as the original tensor containing the absolute values of the original tensor. -### ` pow(Scalar exponent)` +### pow(Scalar exponent) Returns a tensor of the same type and dimensions as the original tensor containing the coefficients of the original tensor to the power of the @@ -914,17 +915,17 @@ cubic roots of an int Tensor: 0 1 2 3 4 5 -### ` operator * (Scalar scale)` +### operator * (Scalar scale) Multiplies all the coefficients of the input tensor by the provided scale. -### ` cwiseMax(Scalar threshold)` +### cwiseMax(Scalar threshold) TODO -### ` cwiseMin(Scalar threshold)` +### cwiseMin(Scalar threshold) TODO -### ` unaryExpr(const CustomUnaryOp& func)` +### unaryExpr(const CustomUnaryOp& func) TODO @@ -936,39 +937,39 @@ dimensions as the tensors to which they are applied, and unless otherwise specified it is also of the same type. The requested operations are applied to each pair of elements independently. -### ` operator+(const OtherDerived& other)` +### operator+(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise sums of the inputs. 
-### ` operator-(const OtherDerived& other)` +### operator-(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise differences of the inputs. -### ` operator*(const OtherDerived& other)` +### operator*(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise products of the inputs. -### ` operator/(const OtherDerived& other)` +### operator/(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise quotients of the inputs. This operator is not supported for integer types. -### ` cwiseMax(const OtherDerived& other)` +### cwiseMax(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise maximums of the inputs. -### ` cwiseMin(const OtherDerived& other)` +### cwiseMin(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise mimimums of the inputs. -### ` Logical operators` +### Logical operators The following logical operators are supported as well: @@ -1126,55 +1127,107 @@ scalar, represented as a zero-dimension tensor. 276 -### ` sum(const Dimensions& new_dims)` -### ` sum()` +### sum(const Dimensions& new_dims) +### sum() Reduce a tensor using the sum() operator. The resulting values are the sum of the reduced values. -### ` mean(const Dimensions& new_dims)` -### ` mean()` +### mean(const Dimensions& new_dims) +### mean() Reduce a tensor using the mean() operator. The resulting values are the mean of the reduced values. -### ` maximum(const Dimensions& new_dims)` -### ` maximum()` +### maximum(const Dimensions& new_dims) +### maximum() Reduce a tensor using the maximum() operator. The resulting values are the largest of the reduced values. 
-### ` minimum(const Dimensions& new_dims)` -### ` minimum()` +### minimum(const Dimensions& new_dims) +### minimum() Reduce a tensor using the minimum() operator. The resulting values are the smallest of the reduced values. -### ` prod(const Dimensions& new_dims)` -### ` prod()` +### prod(const Dimensions& new_dims) +### prod() Reduce a tensor using the prod() operator. The resulting values are the product of the reduced values. -### ` all(const Dimensions& new_dims)` -### ` all()` +### all(const Dimensions& new_dims) +### all() Reduce a tensor using the all() operator. Casts tensor to bool and then checks whether all elements are true. Runs through all elements rather than short-circuiting, so may be significantly inefficient. -### ` any(const Dimensions& new_dims)` -### ` any()` +### any(const Dimensions& new_dims) +### any() Reduce a tensor using the any() operator. Casts tensor to bool and then checks whether any element is true. Runs through all elements rather than short-circuiting, so may be significantly inefficient. -### ` reduce(const Dimensions& new_dims, const Reducer& reducer)` +### reduce(const Dimensions& new_dims, const Reducer& reducer) Reduce a tensor using a user-defined reduction operator. See `SumReducer` in TensorFunctors.h for information on how to implement a reduction operator. +## Trace + +A *Trace* operation returns a tensor with fewer dimensions than the original +tensor. It returns a tensor whose elements are the sum of the elements of the +original tensor along the main diagonal for a list of specified dimensions, the +"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions +are passed as an input parameter to the operation, are of type `::``Dimensions` +, and have the same requirements when passed as an input parameter. In addition, +the trace dimensions must have the same size. + +Example: Trace along 2 dimensions. 
+ + // Create a tensor of 3 dimensions + Eigen::Tensor a(2, 2, 3); + a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}}); + // Specify the dimensions along which the trace will be computed. + // In this example, the trace can only be computed along the dimensions + // with indices 0 and 1 + Eigen::array dims({0, 1}); + // The output tensor contains all but the trace dimensions. + Tensor a_trace = a.trace(dims); + cout << "a_trace:" << endl; + cout << a_trace << endl; + => + a_trace: + 11 + 13 + 15 + + +### trace(const Dimensions& new_dims) +### trace() + +As a special case, if no parameter is passed to the operation, trace is computed +along *all* dimensions of the input tensor. + +Example: Trace along all dimensions. + + // Create a tensor of 3 dimensions, with all dimensions having the same size. + Eigen::Tensor a(3, 3, 3); + a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, + {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}, + {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}); + // Result is a zero dimension tensor + Tensor a_trace = a.trace(); + cout<<"a_trace:"< + a_trace: + 42 + + ## Scan Operations A *Scan* operation returns a tensor with the same dimensions as the original @@ -1204,18 +1257,18 @@ dd a comment to this line 1 3 6 4 9 15 -### ` cumsum(const Index& axis)` +### cumsum(const Index& axis) Perform a scan by summing consecutive entries. -### ` cumprod(const Index& axis)` +### cumprod(const Index& axis) Perform a scan by multiplying consecutive entries. ## Convolutions -### ` convolve(const Kernel& kernel, const Dimensions& dims)` +### convolve(const Kernel& kernel, const Dimensions& dims) Returns a tensor that is the output of the convolution of the input tensor with the kernel, along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor @@ -1258,7 +1311,7 @@ These operations return a Tensor with different dimensions than the original Tensor. 
They can be used to access slices of tensors, see them with different dimensions, or pad tensors with additional data. -### ` reshape(const Dimensions& new_dims)` +### reshape(const Dimensions& new_dims) Returns a view of the input tensor that has been reshaped to the specified new dimensions. The argument new_dims is an array of Index values. The @@ -1337,7 +1390,7 @@ Note that "b" itself was not reshaped but that instead the assignment is done to the reshape view of b. -### ` shuffle(const Shuffle& shuffle)` +### shuffle(const Shuffle& shuffle) Returns a copy of the input tensor whose dimensions have been reordered according to the specified permutation. The argument shuffle @@ -1378,7 +1431,7 @@ Let's rewrite the previous example to take advantage of this feature: output.shuffle({2, 0, 1}) = input; -### ` stride(const Strides& strides)` +### stride(const Strides& strides) Returns a view of the input tensor that strides (skips stride-1 elements) along each of the dimensions. The argument strides is an @@ -1404,7 +1457,7 @@ It is possible to assign a tensor to a stride: output.stride({2, 3, 4}) = input; -### ` slice(const StartIndices& offsets, const Sizes& extents)` +### slice(const StartIndices& offsets, const Sizes& extents) Returns a sub-tensor of the given tensor. For each dimension i, the slice is made of the coefficients stored between offset[i] and offset[i] + extents[i] in @@ -1430,7 +1483,7 @@ the input tensor. 600 700 -### ` chip(const Index offset, const Index dim)` +### chip(const Index offset, const Index dim) A chip is a special kind of slice. It is the subtensor at the given offset in the dimension dim. The returned tensor has one fewer dimension than the input @@ -1481,7 +1534,7 @@ lvalue. For example: 0 0 0 -### ` reverse(const ReverseDimensions& reverse)` +### reverse(const ReverseDimensions& reverse) Returns a view of the input tensor that reverses the order of the coefficients along a subset of the dimensions. 
The argument reverse is an array of boolean @@ -1511,7 +1564,7 @@ of a 2D tensor: 0 100 200 -### ` broadcast(const Broadcast& broadcast)` +### broadcast(const Broadcast& broadcast) Returns a view of the input tensor in which the input is replicated one to many times. @@ -1535,11 +1588,11 @@ made in each of the dimensions. 0 100 200 0 100 200 300 400 500 300 400 500 -### ` concatenate(const OtherDerived& other, Axis axis)` +### concatenate(const OtherDerived& other, Axis axis) TODO -### ` pad(const PaddingDimensions& padding)` +### pad(const PaddingDimensions& padding) Returns a view of the input tensor in which the input is padded with zeros. @@ -1564,7 +1617,7 @@ Returns a view of the input tensor in which the input is padded with zeros. 0 0 0 0 -### ` extract_patches(const PatchDims& patch_dims)` +### extract_patches(const PatchDims& patch_dims) Returns a tensor of coefficient patches extracted from the input tensor, where each patch is of dimension specified by 'patch_dims'. The returned tensor has @@ -1575,83 +1628,83 @@ dimension in RowMajor layout. 
For example, given the following input tensor: - Eigen::Tensor tensor(3,4); - tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, - {4.0f, 5.0f, 6.0f, 7.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}); + Eigen::Tensor tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); - cout << "tensor: " << endl << tensor << endl; -=> -tensor: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 + cout << "tensor: " << endl << tensor << endl; + => + tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 Six 2x2 patches can be extracted and indexed using the following code: - Eigen::Tensor patch; - Eigen::array patch_dims; - patch_dims[0] = 2; - patch_dims[1] = 2; - patch = tensor.extract_patches(patch_dims); - for (int k = 0; k < 6; ++k) { - cout << "patch index: " << k << endl; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - if (DataLayout == ColMajor) { - cout << patch(i, j, k) << " "; - } else { - cout << patch(k, i, j) << " "; - } + Eigen::Tensor patch; + Eigen::array patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; } - cout << endl; } - } This code results in the following output when the data layout is ColMajor: -patch index: 0 -0 1 -4 5 -patch index: 1 -4 5 -8 9 -patch index: 2 -1 2 -5 6 -patch index: 3 -5 6 -9 10 -patch index: 4 -2 3 -6 7 -patch index: 5 -6 7 -10 11 + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 4 5 + 8 9 + patch index: 2 + 1 2 + 5 6 + patch index: 3 + 5 6 + 9 10 + patch index: 4 + 2 3 + 6 7 + patch index: 5 + 6 7 + 10 11 This code results in the following output when the data layout is RowMajor: (NOTE: the set of patches is the same as in ColMajor, but are indexed differently). 
-patch index: 0 -0 1 -4 5 -patch index: 1 -1 2 -5 6 -patch index: 2 -2 3 -6 7 -patch index: 3 -4 5 -8 9 -patch index: 4 -5 6 -9 10 -patch index: 5 -6 7 -10 11 - -### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 1 2 + 5 6 + patch index: 2 + 2 3 + 6 7 + patch index: 3 + 4 5 + 8 9 + patch index: 4 + 5 6 + 9 10 + patch index: 5 + 6 7 + 10 11 + +### extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type) Returns a tensor of coefficient image patches extracted from the input tensor, which is expected to have dimensions ordered as follows (depending on the data @@ -1681,32 +1734,34 @@ sizes: *) columns: 5 *) batch: 7 - Tensor tensor(2,3,5,7); - Tensor tensor_row_major = tensor.swap_layout(); + Tensor tensor(2,3,5,7); + Tensor tensor_row_major = tensor.swap_layout(); 2x2 image patches can be extracted and indexed using the following code: *) 2D patch: ColMajor (patch indexed by second-to-last dimension) - Tensor twod_patch; - twod_patch = tensor.extract_image_patches<2, 2>(); - // twod_patch.dimension(0) == 2 - // twod_patch.dimension(1) == 2 - // twod_patch.dimension(2) == 2 - // twod_patch.dimension(3) == 3*5 - // twod_patch.dimension(4) == 7 + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 *) 2D patch: RowMajor (patch indexed by the second dimension) - Tensor twod_patch_row_major; - twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); - // twod_patch_row_major.dimension(0) == 7 - // twod_patch_row_major.dimension(1) == 3*5 - // twod_patch_row_major.dimension(2) == 2 - // twod_patch_row_major.dimension(3) == 2 - 
// twod_patch_row_major.dimension(4) == 2 + + Tensor twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 ## Special Operations -### ` cast()` +### cast() Returns a tensor of type T with the same dimensions as the original tensor. The returned tensor contains the values of the original tensor converted to @@ -1735,7 +1790,7 @@ but you can easily cast the tensors to floats to do the division: 1 2 2 -### ` eval()` +### eval() TODO diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 00295a25..8cac2bb1 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -112,7 +112,7 @@ class Tensor : public TensorBase - EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -388,6 +388,7 @@ class Tensor : public TensorBase(assign, DefaultDevice()).dimensions()); internal::TensorExecutor::run(assign, DefaultDevice()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase& other) @@ -398,6 +399,20 @@ class Tensor : public TensorBase::run(assign, DefaultDevice()); } + #if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(Self&& other) + : m_storage(std::move(other.m_storage)) + { + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) + { + m_storage = std::move(other.m_storage); + return *this; + } + #endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { @@ -462,6 +477,18 @@ class Tensor : public TensorBase + EIGEN_DEVICE_FUNC + void resize(const Eigen::IndexList& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#endif + /** Custom Dimension */ #ifdef EIGEN_HAS_SFINAE template > : public traits template struct eval, Eigen::Dense> { - typedef const TensorIndexTupleOp& type; + typedef const TensorIndexTupleOpEIGEN_DEVICE_REF type; }; template @@ -82,28 +82,35 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions Dimensions; static const int NumDims = internal::array_size::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef 
internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -117,7 +124,13 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: TensorEvaluator m_impl; @@ -147,7 +160,7 @@ struct traits > : public traits struct eval, Eigen::Dense> { - typedef const TensorTupleReducerOp& type; + typedef const TensorTupleReducerOpEIGEN_DEVICE_REF type; }; template @@ -172,7 +185,7 @@ class TensorTupleReducerOp : public TensorBase, Devi typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; static const int NumDims = internal::array_size::value; typedef array StrideDims; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory TupleStorageMem; enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, - Layout = TensorEvaluator >, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = 
/*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator >, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) { - + m_return_dim(op.return_dim()) + { gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast(ColMajor)) { const Index total_size = internal::array_prod(m_orig_impl.dimensions()); @@ -231,19 +252,22 @@ struct TensorEvaluator, Devi } else { const Index total_size = internal::array_prod(m_orig_impl.dimensions()); m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; - } - m_stride_div = m_strides[m_return_dim]; + } + // If m_return_dim is not a valid index, returns 1 or this can crash on Windows. + m_stride_div = ((m_return_dim >= 0) && + (m_return_dim < static_cast(m_strides.size()))) + ? m_strides[m_return_dim] : 1; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -252,7 +276,13 @@ struct TensorEvaluator, Devi return (m_return_dim < 0) ? 
v.first : (v.first % m_stride_mod) / m_stride_div; } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_orig_impl.bind(cgh); + } +#endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -288,7 +318,7 @@ struct TensorEvaluator, Devi protected: TensorEvaluator, Device> m_orig_impl; TensorEvaluator >, Device> m_impl; - const int m_return_dim; + const Index m_return_dim; StrideDims m_strides; Index m_stride_mod; Index m_stride_div; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 166be200..e5811d63 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -34,6 +34,7 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const std::size_t NumDimensions = internal::traits::NumDimensions; static const int Layout = internal::traits::Layout; + typedef typename traits::PointerType PointerType; enum { Flags = 0 @@ -67,6 +68,8 @@ class TensorAssignOp : public TensorBase typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; + static const int NumDims = Eigen::internal::traits::NumDimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} @@ -94,20 +97,41 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; - static const int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename 
Storage::Type EvaluatorPointerType; + + static const int PacketSize = PacketType::size; + static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + RightTensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT( + (static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -118,7 +142,7 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { 
eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); m_leftImpl.evalSubExprsIfNeeded(NULL); // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non @@ -127,7 +151,19 @@ struct TensorEvaluator, Device> // by the rhs to the lhs. return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync( + m_leftImpl.data(), [done](bool need_assign) { done(need_assign); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); } @@ -136,6 +172,7 @@ struct TensorEvaluator, Device> m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); @@ -163,12 +200,41 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::merge( + m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + if (TensorEvaluator::RawAccess && + m_leftImpl.data() != NULL) { + // If destination has raw data access, we pass it as a potential + // destination for a block descriptor evaluation. + desc.template AddDestinationBuffer( + /*dst_base=*/m_leftImpl.data() + desc.offset(), + /*dst_strides=*/internal::strides(m_leftImpl.dimensions())); + } + + RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true); + // If block was evaluated into a destination, there is no need to do assignment. 
+ if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + m_leftImpl.writeBlock(desc, block); + } + block.cleanup(); + } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } +#endif - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } private: TensorEvaluator m_leftImpl; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index f573608d..35b6458e 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -20,7 +20,7 @@ namespace Eigen { * \brief The tensor base class. * * This class is the common parent of the Tensor and TensorMap class, thus - * making it possible to use either class interchangably in expressions. + * making it possible to use either class interchangeably in expressions. 
*/ #ifndef EIGEN_PARSED_BY_DOXYGEN // FIXME Doxygen does not like the inheritance with different template parameters @@ -135,6 +135,78 @@ class TensorBase return unaryExpr(internal::scalar_digamma_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0() const { + return unaryExpr(internal::scalar_bessel_i0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0e() const { + return unaryExpr(internal::scalar_bessel_i0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1() const { + return unaryExpr(internal::scalar_bessel_i1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1e() const { + return unaryExpr(internal::scalar_bessel_i1e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j0() const { + return unaryExpr(internal::scalar_bessel_j0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y0() const { + return unaryExpr(internal::scalar_bessel_y0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j1() const { + return unaryExpr(internal::scalar_bessel_j1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y1() const { + return unaryExpr(internal::scalar_bessel_y1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0() const { + return unaryExpr(internal::scalar_bessel_k0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0e() const { + return unaryExpr(internal::scalar_bessel_k0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1() const { + return 
unaryExpr(internal::scalar_bessel_k1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1e() const { + return unaryExpr(internal::scalar_bessel_k1e_op()); + } + // igamma(a = this, x = other) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> @@ -142,6 +214,20 @@ class TensorBase return binaryExpr(other.derived(), internal::scalar_igamma_op()); } + // igamma_der_a(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma_der_a(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op()); + } + + // gamma_sample_der_alpha(alpha = this, sample = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + gamma_sample_der_alpha(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op()); + } + // igammac(a = this, x = other) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> @@ -176,9 +262,15 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ndtri() const { + return unaryExpr(internal::scalar_ndtri_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> sigmoid() const { - return unaryExpr(internal::scalar_sigmoid_op()); + return unaryExpr(internal::scalar_logistic_op()); } EIGEN_DEVICE_FUNC @@ -187,6 +279,12 @@ class TensorBase return unaryExpr(internal::scalar_exp_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + expm1() const { + return unaryExpr(internal::scalar_expm1_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const 
Derived> log() const { @@ -199,6 +297,12 @@ class TensorBase return unaryExpr(internal::scalar_log1p_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log2() const { + return unaryExpr(internal::scalar_log2_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> abs() const { @@ -206,9 +310,17 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + clip(Scalar min, Scalar max) const { + return unaryExpr(internal::scalar_clamp_op(min, max)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const typename internal::conditional::IsComplex, + TensorCwiseUnaryOp, const Derived>, + Derived>::type conjugate() const { - return unaryExpr(internal::scalar_conjugate_op()); + return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); } EIGEN_DEVICE_FUNC @@ -289,22 +401,27 @@ class TensorBase return unaryExpr(internal::scalar_mod_op(rhs)); } + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > cwiseMax(Scalar threshold) const { - return cwiseMax(constant(threshold)); + return cwiseMax(constant(threshold)); } + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > cwiseMin(Scalar threshold) const { - return cwiseMin(constant(threshold)); + return cwiseMin(constant(threshold)); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorConversionOp + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const typename internal::conditional::value, + Derived, + TensorConversionOp 
>::type cast() const { - return TensorConversionOp(derived()); + return choose(Cond::value>(), derived(), TensorConversionOp(derived())); } EIGEN_DEVICE_FUNC @@ -313,6 +430,12 @@ class TensorBase return unaryExpr(internal::scalar_round_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rint() const { + return unaryExpr(internal::scalar_rint_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> ceil() const { @@ -357,16 +480,16 @@ class TensorBase return binaryExpr(other.derived(), internal::scalar_quotient_op()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_max_op()); + return binaryExpr(other.derived(), internal::scalar_max_op()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_min_op()); + return binaryExpr(other.derived(), internal::scalar_min_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -481,9 +604,15 @@ class TensorBase typedef Eigen::IndexPair DimensionPair; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp + const TensorContractionOp contract(const OtherDerived& other, const Dimensions& dims) const { - return TensorContractionOp(derived(), other.derived(), dims); + return TensorContractionOp(derived(), other.derived(), dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& 
output_kernel) const { + return TensorContractionOp(derived(), other.derived(), dims, output_kernel); } // Convolutions. @@ -496,8 +625,8 @@ class TensorBase // Fourier transforms template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorFFTOp - fft(const FFT& fft) const { - return TensorFFTOp(derived(), fft); + fft(const FFT& dims) const { + return TensorFFTOp(derived(), dims); } // Scan. @@ -559,51 +688,53 @@ class TensorBase return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> maximum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); } - const TensorReductionOp, const DimensionList, const Derived> + template + const TensorReductionOp, const DimensionList, const Derived> maximum() const { DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> minimum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); } - const TensorReductionOp, const DimensionList, const Derived> + template + const TensorReductionOp, const DimensionList, const Derived> minimum() const { DimensionList in_dims; - return 
TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > + const TensorReductionOp::value, Derived, TensorConversionOp >::type > all(const Dims& dims) const { return cast().reduce(dims, internal::AndReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > + const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > all() const { DimensionList in_dims; return cast().reduce(in_dims, internal::AndReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > + const TensorReductionOp::value, Derived, TensorConversionOp >::type > any(const Dims& dims) const { return cast().reduce(dims, internal::OrReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > + const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > any() const { DimensionList in_dims; return cast().reduce(in_dims, internal::OrReducer()); @@ -615,7 +746,7 @@ class TensorBase const array, const Derived> argmax() const { array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, @@ -628,7 +759,7 @@ class TensorBase const array, const Derived> argmin() const { array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, @@ -639,7 +770,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, const Derived> - argmax(const int return_dim) 
const { + argmax(const Index return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -652,7 +783,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, const Derived> - argmin(const int return_dim) const { + argmin(const Index return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -667,10 +798,22 @@ class TensorBase return TensorReductionOp(derived(), dims, reducer); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTraceOp + trace(const Dims& dims) const { + return TensorTraceOp(derived(), dims); + } + + const TensorTraceOp, const Derived> + trace() const { + DimensionList in_dims; + return TensorTraceOp, const Derived>(derived(), in_dims); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorBroadcastingOp - broadcast(const Broadcast& broadcast) const { - return TensorBroadcastingOp(derived(), broadcast); + broadcast(const Broadcast& bcast) const { + return TensorBroadcastingOp(derived(), bcast); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -778,8 +921,8 @@ class TensorBase } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorShufflingOp - shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorStridingOp @@ -820,7 +963,8 @@ class TensorBase protected: template friend class Tensor; template friend class TensorFixedSize; - template friend class TensorBase; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } }; @@ -828,6 +972,7 @@ class TensorBase template::value> class TensorBase : public TensorBase { public: + typedef TensorBase Base; typedef internal::traits 
DerivedTraits; typedef typename DerivedTraits::Scalar Scalar; typedef typename DerivedTraits::Index Index; @@ -836,7 +981,8 @@ class TensorBase : public TensorBase { template friend class Tensor; template friend class TensorFixedSize; - template friend class TensorBase; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setZero() { @@ -974,13 +1120,13 @@ class TensorBase : public TensorBase { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorShufflingOp - shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp - shuffle(const Shuffle& shuffle) { - return TensorShufflingOp(derived(), shuffle); + shuffle(const Shuffle& shfl) { + return TensorShufflingOp(derived(), shfl); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -996,11 +1142,29 @@ class TensorBase : public TensorBase { // Select the device on which to evaluate the expression. template - TensorDevice device(const DeviceType& device) { - return TensorDevice(device, derived()); + TensorDevice device(const DeviceType& dev) { + return TensorDevice(dev, derived()); + } + + // Select the async device on which to evaluate the expression. 
+ template + TensorAsyncDevice device(const DeviceType& dev, DoneCallback done) { + return TensorAsyncDevice(dev, derived(), std::move(done)); } protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase) + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase) + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(derived(), other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return derived(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } EIGEN_DEVICE_FUNC diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 00000000..1e55d12c --- /dev/null +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,1559 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +namespace Eigen { +namespace internal { + +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template +class TensorBlockIO; + +// -------------------------------------------------------------------------- // +// Helper function to compute strides for densely stored buffer of given +// dimensions. + +// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use +// this function instead everywhere. 
+template +EIGEN_ALWAYS_INLINE DSizes strides( + const DSizes& dimensions) { + DSizes strides; + if (NumDims == 0) return strides; + + // TODO(ezhulenev): Use templates to unroll this loop (similar to + // h_array_reduce in CXX11meta.h)? Benchmark it. + if (static_cast(Layout) == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dimensions[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dimensions[i + 1]; + } + } + + return strides; +} + +template +EIGEN_ALWAYS_INLINE DSizes strides( + const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + +template +EIGEN_STRONG_INLINE DSizes strides( + const Sizes& sizes) { + return strides(DSizes(sizes)); +} + +// -------------------------------------------------------------------------- // + +// Tensor block shape type defines what are the shape preference for the blocks +// extracted from the larger tensor. +// +// Example: blocks of 100 elements from the large 100x100 tensor: +// - tensor: 100x100 +// - target_block_size: 100 +// +// TensorBlockShapeType: +// - kUniformAllDims: 100 blocks of size 10x10 +// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column +// or row major layout) +enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; + +struct TensorBlockResourceRequirements { + TensorBlockShapeType shape_type; // target block shape + size_t size; // target block size + TensorOpCost cost_per_coeff; // cost of computing a single block element + +#ifdef EIGEN_HIPCC + // For HIPCC, we need to explicitly declare as a "device fun", the constructor + // which is implicitly invoked in the "merge" / "any" routines. 
else HIPCC + // errors out complaining about the lack of a matching constructor + EIGEN_DEVICE_FUNC + TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, + TensorOpCost cost_) + : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) + {} +#endif + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( + TensorBlockShapeType shape_type, size_t size_in_bytes, + TensorOpCost cost) { + const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); + return {shape_type, size, cost}; + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( + TensorBlockShapeType shape_type, size_t size_in_bytes) { + // This default cost per coefficient is valid for most materialized tensor + // block evaluation implementations, because they typically just read + // coefficients from the underlying tensor storage, and write to the tensor + // block buffer (scratch or destination memory, reads and writes have linear + // access pattern). We ignore the fixed cost of block evaluation, because in + // practice it should negligible. + // + // Lazy block evaluation adds the cost of calling a functor for each + // coefficient. + // + // All non-trivial block evaluation implementations must provide their own + // cost approximation (e.g. shuffling inner dimension has a much higher cost + // because it reads memory randomly, although the total number of moved + // bytes is the same). 
+ return withShapeAndSize(shape_type, size_in_bytes, + {/*bytes_loaded=*/sizeof(Scalar), + /*bytes_stored=*/sizeof(Scalar), + /*compute_cycles=*/0}); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed( + size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kSkewedInnerDims, + size_in_bytes); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform( + size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kUniformAllDims, + size_in_bytes); + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockResourceRequirements + merge(const TensorBlockResourceRequirements& lhs, + const TensorBlockResourceRequirements& rhs) { + return {merge(lhs.shape_type, rhs.shape_type), // shape_type + merge(lhs.size, rhs.size), // size + merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff + } + + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff( + TensorOpCost cost) { + cost_per_coeff += cost; + return *this; + } + + // This is a resource requirement that should be returned from expressions + // that do not have any block evaluation preference (e.g. default tensor + // expression with raw buffer access). + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { + return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; + } + + private: + using Requirements = TensorBlockResourceRequirements; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { + return numext::maxi(lhs_size, rhs_size); + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockShapeType + merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) { + return (lhs == TensorBlockShapeType::kSkewedInnerDims || + rhs == TensorBlockShapeType::kSkewedInnerDims) + ? 
TensorBlockShapeType::kSkewedInnerDims + : TensorBlockShapeType::kUniformAllDims; + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, + TensorOpCost rhs_cost) { + return lhs_cost + rhs_cost; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockDescriptor specifies a block offset within a tensor and the block +// sizes along each of the tensor dimensions. + +template +class TensorBlockDescriptor { + public: + typedef DSizes Dimensions; + + // If we evaluate a Tensor assignment, and expression on the left, already has + // a memory buffer, then we might do performance optimization, and evaluate + // the root expression directly into the final output memory. Some time it's + // possible to reuse it for materializing subexpressions inside an expression + // tree, to to avoid dynamic memory allocation. + // + // The pointer type of the underlying storage is erased, because passing + // Scalar type through all the expression evaluation layers is way too many + // templates. In practice destination buffer type should always match the + // evaluated expression scalar type. + class DestinationBuffer { + public: + enum DestinationBufferKind : int { + // The above explicit specification of "int" as the enum basetype is + // needed to get around a HIPCC link error ("the field type is not + // amp-compatible") + // which is issued for class members with the enum type. + // TODO(rocm): + // remove the "int" basetype once HIPCC has been fixed to not error out + // in the above scenario. + + // Destination buffer is not defined (`m_data` == nullptr). + kEmpty, + + // Tensor block defined by an owning tensor block descriptor can fit + // contiguously into the destination buffer. In this case it's safe to + // materialize tensor block in the destination buffer, wrap it in a + // TensorMap, and use to build Eigen expression on top of it. 
+ kContiguous, + + // Destination buffer strides do not match strides of the contiguously + // stored block, and it's impossible to define a TensorMap over this + // buffer. However if we are evaluating a root of an expression tree, we + // still can materialize an output into this destination, because we can + // guarantee that no one will ever access it through block API. + // + // In theory it is possible to build valid TensorStriding + // expression on top of this destination buffer, however it has + // inefficient coeff/packet access, and defeats the purpose of fast block + // evaluation API. + kStrided + }; + + template + Scalar* data() const { + eigen_assert(m_data_type_size == sizeof(Scalar)); + return static_cast(m_data); + } + + const Dimensions& strides() const { return m_strides; } + const DestinationBufferKind& kind() const { return m_kind; } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& strides, + DestinationBufferKind kind) + : m_data(static_cast(data)), + m_data_type_size(sizeof(Scalar)), + m_strides(strides), + m_kind(kind) {} + + template + static DestinationBuffer make(const TensorBlockDescriptor& desc, + Scalar* data, const Dimensions& strides) { + return DestinationBuffer(data, strides, kind(desc, strides)); + } + + template + static DestinationBufferKind kind(const TensorBlockDescriptor& desc, + const Dimensions& strides) { + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& desc_strides = internal::strides(desc_dims); + for (int i = 0; i < NumDims; ++i) { + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != strides[i]) return kStrided; + } + return kContiguous; + } + + // Storage pointer is type erased, to reduce template bloat, but we still + // keep the size of the underlying element type for error checking. 
+ void* m_data; + size_t m_data_type_size; + + // Destination buffer dimensions always match the dimensions of a tensor + // block descriptor it belongs to, however strides might be different. + Dimensions m_strides; + + DestinationBufferKind m_kind; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, + const DestinationBuffer& destination) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + const DestinationBuffer& destination() const { return m_destination; } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { + eigen_assert(dst_base != NULL); + m_destination = + DestinationBuffer::template make(*this, dst_base, dst_strides); + } + + template + void AddDestinationBuffer( + Scalar* dst_base, + const DSizes& dst_strides) { + // DSizes constructor will do index type promotion if it's safe. + AddDestinationBuffer(dst_base, Dimensions(dst_strides)); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + m_destination.m_kind = DestinationBuffer::kEmpty; + return *this; + } + + bool HasDestinationBuffer() const { + return m_destination.kind() != DestinationBuffer::kEmpty; + } + + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. 
+ const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockMapper is responsible for iterating over the blocks of a tensor. + +template +class TensorBlockMapper { + typedef TensorBlockDescriptor BlockDescriptor; + + public: + typedef DSizes Dimensions; + + TensorBlockMapper() = default; + TensorBlockMapper(const DSizes& dimensions, + const TensorBlockResourceRequirements& requirements) + : m_tensor_dimensions(dimensions), m_requirements(requirements) { + // Compute block dimensions and the total number of blocks. + InitializeBlockDimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { + return m_total_block_count; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { + return m_block_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& + blockDimensions() const { + return m_block_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor + blockDescriptor(IndexType block_index) const { + static const bool isColMajor = Layout == static_cast(ColMajor); + + IndexType offset = 0; + DSizes dimensions; + + if (NumDims == 0) return BlockDescriptor(offset, dimensions); + + // Iterate outer -> inner dimensions. + for (int i = NumDims - 1; i >= 0; --i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + const IndexType idx = block_index / m_block_strides[dim]; + block_index -= idx * m_block_strides[dim]; + + const IndexType coord = idx * m_block_dimensions[dim]; + dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, + m_block_dimensions[dim]); + offset += coord * m_tensor_strides[dim]; + } + + return {offset, dimensions}; + } + + private: + void InitializeBlockDimensions() { + // Requested block shape and size. 
+ const TensorBlockShapeType shape_type = m_requirements.shape_type; + IndexType target_block_size = + numext::maxi(1, static_cast(m_requirements.size)); + + IndexType tensor_size = m_tensor_dimensions.TotalSize(); + + // Corner case: one of the dimensions is zero. Logic below is too complex + // to handle this case on a general basis, just use unit block size. + // Note: we must not yield blocks with zero dimensions (recipe for + // overflows/underflows, divisions by zero and NaNs later). + if (tensor_size == 0) { + for (int i = 0; i < NumDims; ++i) { + m_block_dimensions[i] = 1; + } + m_total_block_count = 0; + return; + } + + // If tensor fits into a target block size, evaluate it as a single block. + if (tensor_size <= target_block_size) { + m_block_dimensions = m_tensor_dimensions; + m_total_block_count = 1; + // The only valid block index is `0`, and in this case we do not need + // to compute real strides for tensor or blocks (see blockDescriptor). + for (int i = 0; i < NumDims; ++i) { + m_tensor_strides[i] = 0; + m_block_strides[i] = 1; + } + return; + } + + static const bool isColMajor = Layout == static_cast(ColMajor); + + // Block shape skewed towards inner dimension. + if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { + IndexType coeff_to_allocate = target_block_size; + + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + m_block_dimensions[dim] = + numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); + coeff_to_allocate = divup( + coeff_to_allocate, + numext::maxi(static_cast(1), m_block_dimensions[dim])); + } + eigen_assert(coeff_to_allocate == 1); + + } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { + // Tensor will not fit within 'target_block_size' budget: calculate tensor + // block dimension sizes based on "square" dimension size target. 
+ const IndexType dim_size_target = convert_index( + std::pow(static_cast(target_block_size), + 1.0f / static_cast(m_block_dimensions.rank()))); + + for (int i = 0; i < NumDims; ++i) { + // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it + // a multiple of the packet size. Note that reducing + // 'block_dim_size' in this manner can increase the number of + // blocks, and so will amplify any per-block overhead. + m_block_dimensions[i] = + numext::mini(dim_size_target, m_tensor_dimensions[i]); + } + + // Add any un-allocated coefficients to inner dimension(s). + IndexType total_size = m_block_dimensions.TotalSize(); + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { + const IndexType total_size_other_dims = + total_size / m_block_dimensions[dim]; + const IndexType alloc_avail = + divup(target_block_size, total_size_other_dims); + if (alloc_avail == m_block_dimensions[dim]) { + // Insufficient excess coefficients to allocate. + break; + } + m_block_dimensions[dim] = + numext::mini(m_tensor_dimensions[dim], alloc_avail); + total_size = total_size_other_dims * m_block_dimensions[dim]; + } + } + + } else { + eigen_assert(false); // unknown block shape + } + + eigen_assert(m_block_dimensions.TotalSize() >= + numext::mini(target_block_size, + m_tensor_dimensions.TotalSize())); + + // Calculate block counts by dimension and total block count. + DSizes block_count; + for (int i = 0; i < NumDims; ++i) { + block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). 
+ m_tensor_strides = strides(m_tensor_dimensions); + m_block_strides = strides(block_count); + } + + DSizes m_tensor_dimensions; + TensorBlockResourceRequirements m_requirements; + + DSizes m_block_dimensions; + IndexType m_total_block_count; + + DSizes m_tensor_strides; + DSizes m_block_strides; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockScratchAllocator is responsible for allocating temporary buffers +// for block evaluation (output or input block materialization). Given that +// Eigen expression traversal order is deterministic, all temporary allocations +// are happening in the same order, and usually have exactly the same size. +// Scratch allocator keeps a trace of all dynamic allocations, and after the +// first block evaluation is completed, we should be able to reuse all the +// temporary buffers for the next block evaluation. + +template +class TensorBlockScratchAllocator { + public: + explicit TensorBlockScratchAllocator(const Device& device) + : m_device(device), m_allocation_index(0) {} + + ~TensorBlockScratchAllocator() { + for (size_t i = 0; i < m_allocations.size(); ++i) { + m_device.deallocate(m_allocations[i].ptr); + } + } + + void* allocate(size_t size) { + // TODO(ezhulenev): Remove when replaced with inlined vector. + if (m_allocations.capacity() == 0) m_allocations.reserve(8); + + // Check if we already have an existing allocation att current index. + const int num_allocations = static_cast(m_allocations.size()); + const bool has_allocation = m_allocation_index < num_allocations; + + // Allocation index can't be larger than the number of allocations. + eigen_assert(m_allocation_index <= num_allocations); + + // If we have existing allocation, and its size is larger or equal to + // requested size, we do nothing. + + // If current allocation can't fit requested size, we deallocate it, and + // replace with a larger allocation. 
+ if (has_allocation && m_allocations[m_allocation_index].size < size) { + m_device.deallocate(m_allocations[m_allocation_index].ptr); + m_allocations[m_allocation_index].ptr = m_device.allocate(size); + m_allocations[m_allocation_index].size = size; + } + + // Make a new allocation if we don't have and existing one. + if (!has_allocation) { + Allocation allocation; + allocation.ptr = m_device.allocate(size); + allocation.size = size; + m_allocations.push_back(allocation); + } + + eigen_assert(m_allocations[m_allocation_index].ptr != NULL); + eigen_assert(m_allocations[m_allocation_index].size >= size); + + return m_allocations[m_allocation_index++].ptr; + } + + void reset() { m_allocation_index = 0; } + + private: + struct Allocation { + void* ptr; + size_t size; + }; + + const Device& m_device; + int m_allocation_index; + // TODO(ezhulenev): This should be an inlined vector. + std::vector m_allocations; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockKind represents all possible block kinds, that can be produced by +// TensorEvaluator::evalBlock function. +enum TensorBlockKind { + // Tensor block that is a lazy expression that must be assigned to a + // destination using TensorBlockAssign. + kExpr, + + // Tensor block that is a view into a memory buffer owned by an underlying + // Tensor expression (e.g. it can be a view into a Tensor buffer). + kView, + + // Tensor block that was materialized in a scratch memory buffer, allocated + // with TensorBlockScratchAllocator. This block must be copied to a + // destination, similar to a block of `kExpr` type. + kMaterializedInScratch, + + // Tensor block that was materialized directly into the final output memory + // buffer. For example if the left side of an assignment is a Tensor, we can + // directly materialize the block in the destination memory. 
+ // + // If strides in the output buffer do not match tensor block strides, the + // Tensor expression will be invalid, and should not be used by + // TensorBlockAssign or for constructing another block expression. + kMaterializedInOutput +}; + +// -------------------------------------------------------------------------- // +// TensorBlockNotImplemented should be used to defined TensorBlock typedef in +// TensorEvaluators that do not support block evaluation. + +class TensorBlockNotImplemented { + public: + typedef void XprType; +}; + +// -------------------------------------------------------------------------- // +// XprScalar extracts Scalar type from the Eigen expressions (if expression type +// is not void). It's required to be able to define lazy block expression for +// argument types, that do not support block evaluation. + +template +struct XprScalar { + typedef typename XprType::Scalar type; +}; +template <> +struct XprScalar { + typedef void type; +}; + +// -------------------------------------------------------------------------- // +// TensorMaterializedBlock is a fully evaluated block of the original tensor, +// and XprType is just a TensorMap over the data. This block type is typically +// used to materialize blocks of tensor expressions, that can't be efficiently +// represented as lazy Tensor expressions with fast coeff/packet operations, +// e.g. we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. 
+// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. + +template +class TensorMaterializedBlock { + public: + typedef DSizes Dimensions; + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, + const Dimensions& dimensions, bool valid_expr = true) + : m_kind(kind), + m_data(data), + m_dimensions(dimensions), + m_expr(m_data, m_dimensions), + m_valid_expr(valid_expr) { + eigen_assert(m_kind == internal::TensorBlockKind::kView || + m_kind == internal::TensorBlockKind::kMaterializedInScratch || + m_kind == internal::TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } + const Scalar* data() const { return m_data; } + void cleanup() {} + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // TensorMaterializedBlock can be backed by different types of storage: + // + // (1) Contiguous block of memory allocated with scratch allocator. + // (2) Contiguous block of memory reused from tensor block descriptor + // destination buffer. + // (3) Strided block of memory reused from tensor block descriptor + // destination buffer. + // + class Storage { + public: + Scalar* data() const { return m_data; } + const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& strides() const { return m_strides; } + + TensorMaterializedBlock AsTensorMaterializedBlock() const { + return TensorMaterializedBlock( + m_materialized_in_output + ? 
internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + m_data, m_dimensions, !m_strided_storage); + } + + private: + friend class TensorMaterializedBlock; + + Storage(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, bool materialized_in_output, + bool strided_storage) + : m_data(data), + m_dimensions(dimensions), + m_strides(strides), + m_materialized_in_output(materialized_in_output), + m_strided_storage(strided_storage) {} + + Scalar* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + bool m_materialized_in_output; + bool m_strided_storage; + }; + + // Creates a storage for materialized block either from the block descriptor + // destination buffer, or allocates a new buffer with scratch allocator. + template + EIGEN_STRONG_INLINE static Storage prepareStorage( + TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool allow_strided_storage = false) { + // Try to reuse destination as an output block buffer. 
+ typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; + + if (desc.destination().kind() == DestinationBuffer::kContiguous) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), + internal::strides(desc.dimensions()), + /*materialized_in_output=*/true, + /*strided_storage=*/false); + + } else if (desc.destination().kind() == DestinationBuffer::kStrided && + allow_strided_storage) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), desc.destination().strides(), + /*materialized_in_output=*/true, /*strided_storage=*/true); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + return Storage(static_cast(mem), desc.dimensions(), + internal::strides(desc.dimensions()), + /*materialized_in_output=*/false, + /*strided_storage=*/false); + } + } + + // Creates a materialized block for the given descriptor from a memory buffer. + template + EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( + const Scalar* data, const DataDimensions& data_dims, + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + eigen_assert(array_size::value == desc.dimensions().size()); + + // If a tensor block dimensions covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `data` memory buffer. + // + // Example: (RowMajor layout) + // data_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `data + desc.offset()`, with a `desc.dimensions()` block sizes. + static const bool is_col_major = Layout == ColMajor; + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + int dim = is_col_major ? 
i : NumDims - i - 1; + if (data_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (desc.dimension(dim) != 1) { + can_use_direct_access = false; + break; + } + } + + if (can_use_direct_access) { + const Scalar* block_start = data + desc.offset(); + return TensorMaterializedBlock(internal::TensorBlockKind::kView, + block_start, desc.dimensions()); + + } else { + // Reuse destination buffer or allocate new buffer with scratch allocator. + const Storage storage = prepareStorage(desc, scratch); + + typedef internal::TensorBlockIO + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), + data, desc.offset()); + TensorBlockIODst dst(storage.dimensions(), storage.strides(), + storage.data()); + + TensorBlockIO::Copy(dst, src); + return storage.AsTensorMaterializedBlock(); + } + } + + private: + TensorBlockKind m_kind; + const Scalar* m_data; + Dimensions m_dimensions; + XprType m_expr; + bool m_valid_expr; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp +// functor to the blocks produced by the underlying Tensor expression. 
+ +template +class TensorCwiseUnaryBlock { + static const bool NoArgBlockAccess = + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + TensorCwiseUnaryOp >:: + type XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) + : m_arg_block(arg_block), m_functor(functor) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + + XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + UnaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp +// functor to the blocks produced by the underlying Tensor expression. + +template +class TensorCwiseBinaryBlock { + static const bool NoArgBlockAccess = + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + TensorCwiseBinaryOp >::type + XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, + const RhsTensorBlock& right_block, + const BinaryOp& functor) + : m_left_block(left_block), + m_right_block(right_block), + m_functor(functor) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + + XprType expr() const { + return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); + } + + const Scalar* data() const { return NULL; } + + void cleanup() { + m_left_block.cleanup(); + m_right_block.cleanup(); + } + + private: + LhsTensorBlock m_left_block; + RhsTensorBlock m_right_block; + BinaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorUnaryExprBlock is a lazy tensor 
expression block that can construct +// an arbitrary tensor expression from a block of the underlying type (this is a +// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). + +template +class TensorUnaryExprBlock { + typedef typename ArgTensorBlock::XprType ArgXprType; + static const bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, + const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. 
+ +template +class TensorTernaryExprBlock { + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static const bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, + const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, + const BlockFactory& factory) + : m_arg1_block(arg1_block), + m_arg2_block(arg2_block), + m_arg3_block(arg3_block), + m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { + return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), + m_arg3_block.expr()); + } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. + +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + public: + // Specifying linear copy kind statically gives ~30% speedup for small sizes. 
+ enum class Kind { + Linear = 0, // src_stride == 1 && dst_stride == 1 + Scatter = 1, // src_stride == 1 && dst_stride != 1 + FillLinear = 2, // src_stride == 0 && dst_stride == 1 + FillScatter = 3, // src_stride == 0 && dst_stride != 1 + Gather = 4, // dst_stride == 1 + Random = 5 // everything else + }; + + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) + : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, + const Src& src, + const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, + src.data); + } + + private: + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + if (kind == StridedLinearBufferCopy::Kind::Linear) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. 
+ const IndexType unrolled_size = count - 4 * PacketSize; + eigen_assert(src_stride == 1 && dst_stride == 1); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { + // Scatter from `src` to `dst`. + eigen_assert(src_stride == 1 && dst_stride != 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { + // Fill `dst` with value at `*src`. + eigen_assert(src_stride == 0 && dst_stride == 1); + const IndexType unrolled_size = count - 4 * PacketSize; + Packet p = pload1(src); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = *src; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { + // Scatter `*src` into `dst`. 
+ eigen_assert(src_stride == 0 && dst_stride != 1); + Packet p = pload1(src); + for (; i <= vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = *src; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Gather) { + // Gather from `src` into `dst`. + eigen_assert(dst_stride == 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Random) { + // Random. + for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } else { + eigen_assert(false); + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. 
+ +template +class TensorBlockIO { + static const bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + typedef DSizes DimensionsMap; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, + IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, + IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + // Returns the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( + const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const DimensionsMap& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. 
+ int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + int num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. + if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const int dst_stride1_dim = IsColMajor + ? num_size_one_inner_dims + : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const int src_dim_for_dst_stride1_dim = + NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? 
i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = + NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 
1 : dst.dims.TotalSize(); + +#define COPY_INNER_DIM(KIND) \ + IndexType num_copied = 0; \ + for (num_copied = 0; num_copied < block_total_size; \ + num_copied += dst_inner_dim_size) { \ + LinCopy::template Run( \ + typename LinCopy::Dst(output_offset, output_stride, dst.data), \ + typename LinCopy::Src(input_offset, input_stride, src.data), \ + dst_inner_dim_size); \ + \ + for (int j = 0; j < idx; ++j) { \ + if (++it[j].count < it[j].size) { \ + input_offset += it[j].input_stride; \ + output_offset += it[j].output_stride; \ + break; \ + } \ + it[j].count = 0; \ + input_offset -= it[j].input_span; \ + output_offset -= it[j].output_span; \ + } \ + } \ + return num_copied; + + if (input_stride == 1 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Linear); + } else if (input_stride == 1 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::Scatter); + } else if (input_stride == 0 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::FillLinear); + } else if (input_stride == 0 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::FillScatter); + } else if (output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Gather); + } else { + COPY_INNER_DIM(LinCopy::Kind::Random); + } + +#undef COPY_INNER_DIM + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. Returns + // the number of copied elements. 
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, + const Src& src) { + DimensionsMap dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + return Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), + count(0), + input_stride(0), + output_stride(0), + input_span(0), + output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. + static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `target->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `target` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. 
+// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. + typedef TensorEvaluator + TensorBlockEvaluator; + + typedef DSizes Dimensions; + + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, + const Evaluator& eval, + IndexType eval_offset) { + for (IndexType i = 0; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, + const Evaluator& eval, + IndexType eval_offset) { + typedef typename packet_traits::type Packet; + + const IndexType unrolled_size = count - 4 * PacketSize; + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + const IndexType idx = eval_offset + i + j * PacketSize; + Packet p = eval.template packet(idx); + pstoreu(target + i + j * PacketSize, p); + } + } + + for (; i <= vectorized_size; i += PacketSize) { + Packet p = eval.template packet(eval_offset + i); + pstoreu(target + i, p); + } + + for (; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + public: + struct Target { + Target(const Dimensions& target_dims, const Dimensions& target_strides, + Scalar* target_data, IndexType target_offset = 0) + : dims(target_dims), + strides(target_strides), + data(target_data), + offset(target_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + static Target target(const Dimensions& 
target_dims, + const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + return Target(target_dims, target_strides, target_data, target_offset); + } + + template + static Target target( + const DSizes& target_dims, + const DSizes& target_strides, + Scalar* target_data, IndexType target_offset = 0) { + // DSizes constructor will do index type promotion if it's safe. + return Target(Dimensions(target_dims), Dimensions(target_strides), + target_data, target_offset); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Target& target, const TensorBlockExpr& expr) { + // Prepare evaluator for block expression. + DefaultDevice default_device; + TensorBlockEvaluator eval(expr, default_device); + + // Tensor block expression dimension should match destination dimensions. + eigen_assert(dimensions_match(target.dims, eval.dimensions())); + + static const int Layout = TensorBlockEvaluator::Layout; + static const bool is_col_major = Layout == ColMajor; + + // Initialize output inner dimension size based on a layout. + const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); + const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; + IndexType output_inner_dim_size = target.dims[inner_dim_idx]; + + // Target inner dimension stride must be '1'. + eigen_assert(target.strides[inner_dim_idx] == 1); + + // Squeeze multiple inner dims into one if they are contiguous in `target`. + IndexType num_squeezed_dims = 0; + for (Index i = 1; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - i - 1; + const IndexType target_stride = target.strides[dim]; + + if (output_inner_dim_size == target_stride) { + output_inner_dim_size *= target.dims[dim]; + num_squeezed_dims++; + } else { + break; + } + } + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). 
+ array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `target` at given offset. + IndexType input_offset = 0; + IndexType output_offset = target.offset; + + // Iterate copying data from `eval` to `target`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `target` at current offset. + InnerDimAssign::Run(target.data + output_offset, + output_inner_dim_size, eval, + input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 4cfe300e..a354132f 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -31,12 +31,13 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = 
XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorBroadcastingOp& type; + typedef const TensorBroadcastingOp EIGEN_DEVICE_REF type; }; template @@ -54,7 +55,7 @@ struct is_input_scalar > { static const bool value = true; }; #ifndef EIGEN_EMULATE_CXX11_META_H -template +template struct is_input_scalar > { static const bool value = (Sizes::total_size == 1); }; @@ -103,27 +104,57 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions InputDimensions; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout; + bool isCopy, nByOne, oneByN; + public: + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_broadcast(op.broadcast()),m_impl(op.expression(), device) + typedef typename internal::remove_const::type ScalarNoConst; + + // We do block based broadcasting using a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. 
+ typedef DSizes BroadcastDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : isCopy(false), nByOne(false), oneByN(false), + m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device) { + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); - const Broadcast& broadcast = op.broadcast(); + isCopy = true; for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i] * broadcast[i]; + m_dimensions[i] = input_dims[i] * m_broadcast[i]; + if (m_broadcast[i] != 1) { + isCopy = false; + } } if (static_cast(Layout) == static_cast(ColMajor)) { @@ -141,16 +172,58 @@ struct TensorEvaluator, Device> m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; } } + + if (input_dims[0] == 1) { + oneByN = true; + for (int i = 1; i < NumDims; ++i) { + if (m_broadcast[i] != 1) { + oneByN = false; + break; + } + } + } else if (input_dims[NumDims-1] == 1) { + nByOne = true; + for (int i = 0; i < NumDims-1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + break; + } + } + } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && 
input_dims[NumDims-1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims-1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -161,16 +234,24 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - return coeffColMajor(index); + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffColMajor(index); + } } else { - return coeffRowMajor(index); + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffRowMajor(index); + } } } // TODO: attempt to speed this up. 
The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -195,12 +276,17 @@ struct TensorEvaluator, Device> inputIndex += (index % m_impl.dimensions()[0]); } } - return m_impl.coeff(inputIndex); + return inputIndex; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -215,17 +301,22 @@ struct TensorEvaluator, Device> } index -= idx * m_outputStrides[i]; } - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); inputIndex += index; } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); } else { - inputIndex += (index % m_impl.dimensions()[NumDims-1]); + inputIndex += (index % m_impl.dimensions()[NumDims - 1]); } } - return m_impl.coeff(inputIndex); + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + return m_impl.coeff(indexRowMajor(index)); } template @@ -236,9 +327,148 @@ struct TensorEvaluator, 
Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); + if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing + // unaligned loads here. The reason is unclear though. + return m_impl.template packet(index); + #else + return m_impl.template packet(index); + #endif + } else if (oneByN && !nByOne) { + return packetNByOne(index); + } else if (!oneByN && nByOne) { + return packetOneByN(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetColMajor(index); + } } else { - return packetRowMajor(index); + if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See above. + return m_impl.template packet(index); + #else + return m_impl.template packet(index); + #endif + } else if (oneByN && !nByOne) { + return packetOneByN(index); + } else if (!oneByN && nByOne) { + return packetNByOne(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetRowMajor(index); + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne + (Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index startDim, endDim; + Index inputIndex, outputOffset, batchedIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + startDim = NumDims - 1; + endDim = 1; + } else { + startDim = 0; + endDim = NumDims - 2; + } + + batchedIndex = index % m_outputStrides[startDim]; + inputIndex = batchedIndex / m_outputStrides[endDim]; + outputOffset = batchedIndex % m_outputStrides[endDim]; + + if (outputOffset + PacketSize <= m_outputStrides[endDim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < 
PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[endDim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + ++inputIndex; + inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); + values[i] = m_impl.coeff(inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index dim, inputIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = NumDims - 1; + } else { + dim = 0; + } + + inputIndex = index % m_inputStrides[dim]; + if (inputIndex + PacketSize <= m_inputStrides[dim]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (inputIndex > m_inputStrides[dim]-1) { + inputIndex = 0; + } + values[i] = m_impl.coeff(inputIndex++); + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index dim, inputIndex, outputOffset; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = 1; + } else { + dim = NumDims - 2; + } + + inputIndex = index / m_outputStrides[dim]; + outputOffset = index % m_outputStrides[dim]; + if (outputOffset + PacketSize <= m_outputStrides[dim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[dim]) { + 
values[i] = m_impl.coeff(inputIndex); + } else { + values[i] = m_impl.coeff(++inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); } } @@ -253,6 +483,7 @@ struct TensorEvaluator, Device> const Index originalIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -288,8 +519,13 @@ struct TensorEvaluator, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffColMajor(originalIndex+i); + if (innermostLoc + i < m_impl.dimensions()[0]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffColMajor(originalIndex+i); + } } PacketReturnType rslt = internal::pload(values); return rslt; @@ -305,6 +541,7 @@ struct TensorEvaluator, Device> const Index originalIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -340,8 +577,13 @@ struct TensorEvaluator, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffRowMajor(originalIndex+i); + if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffRowMajor(originalIndex+i); + } } PacketReturnType rslt = internal::pload(values); return rslt; @@ -351,7 +593,8 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { double compute_cost = TensorOpCost::AddCost(); - if (NumDims > 0) { + if (!isCopy && NumDims > 0) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { compute_cost 
+= TensorOpCost::DivCost(); if (internal::index_statically_eq(i, 1)) { @@ -372,14 +615,472 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. + const size_t target_size = m_device.firstLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + m_impl.getResourceRequirements(), + internal::TensorBlockResourceRequirements::skewed(target_size)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + BlockBroadcastingParams params = blockBroadcastingParams(desc); + + if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) { + return emptyBlock(); + } + + // Prepare storage for the materialized broadcasting result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + ScalarNoConst* materialized_output = block_storage.data(); + + // We potentially will need to materialize input blocks. + size_t materialized_input_size = 0; + ScalarNoConst* materialized_input = NULL; + + // Initialize block broadcating iterator state for outer dimensions (outer + // with regard to bcast dimension). Dimension in this array are always in + // inner_most -> outer_most order (col major layout). + array it; + int idx = 0; + + for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { + const Index dim = IsColMajor ? 
i : NumDims - 1 - i; + it[idx].size = params.output_dims[dim]; + it[idx].count = 0; + it[idx].output_stride = m_outputStrides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // Write output into the beginning of `materialized_output`. + Index output_offset = 0; + + // We will fill output block by broadcasting along the bcast dim, and + // iterating over outer dimension. + const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize(); + + for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) { + ScalarNoConst* bcast_output = materialized_output + num_output_coeffs; + Index bcast_offset = desc.offset() + output_offset; + + // Broadcast along the bcast dimension. + num_output_coeffs += BroadcastBlockAlongBcastDim( + params, bcast_offset, scratch, bcast_output, &materialized_input, + &materialized_input_size); + + // Switch to the next outer dimension. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( + cl::sycl::handler& cgh) const { + m_impl.bind(cgh); + } +#endif + private: + static const bool IsColMajor = + static_cast(Layout) == static_cast(ColMajor); + + // We will build a general case block broadcasting on top of broadcasting + // primitive that will do broadcasting only for the inner dimension(s) along + // the first dimension smaller than the input size (it's called `bcast_dim`). 
+ // + // Example: + // dim: 0 1 2 (ColMajor) + // input size: [9, 3, 6] + // block size: [9, 2, 6] + // + // We will compute broadcasted block by iterating over the outer dimensions + // before `bcast_dim` (only dimension `2` in this example) and computing + // broadcasts along the `bcast_dim` (dimension `1` in this example). + + // BlockBroadcastingParams holds precomputed parameters for broadcasting a + // single block along the broadcasting dimension. Sizes and strides along the + // `bcast_dim` might be invalid, they will be adjusted later in + // `BroadcastBlockAlongBcastDim`. + struct BlockBroadcastingParams { + Dimensions input_dims; // input expression dimensions + Dimensions output_dims; // output block sizes + Dimensions output_strides; // output block strides + + int inner_dim_count; // count inner dimensions matching in size + int bcast_dim; // broadcasting dimension index + Index bcast_dim_size; // broadcasting dimension size + Index inner_dim_size; // inner dimensions size + + // Block sizes and strides for the input block where all dimensions before + // `bcast_dim` are equal to `1`. + Dimensions input_block_sizes; + Dimensions input_block_strides; + + // Block sizes and strides for blocks with extra dimensions and strides `0`. + BroadcastDimensions bcast_block_sizes; + BroadcastDimensions bcast_block_strides; + BroadcastDimensions bcast_input_strides; + }; + + struct BlockBroadcastingIteratorState { + Index size; + Index count; + Index output_stride; + Index output_span; + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams + blockBroadcastingParams(TensorBlockDesc& desc) const { + BlockBroadcastingParams params; + + params.input_dims = Dimensions(m_impl.dimensions()); + + // Output block sizes and strides. + params.output_dims = desc.dimensions(); + params.output_strides = internal::strides(params.output_dims); + + // Find the broadcasting dimension (first dimension with output size smaller + // that the input size). 
+ params.bcast_dim = 0; + params.bcast_dim_size = 1; + params.inner_dim_size = 1; + + // Count the number of inner dimensions that have the same size in the block + // and in the broadcast expression. + params.inner_dim_count = 0; + + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + if (params.output_dims[dim] == m_dimensions[dim]) { + params.inner_dim_size *= params.output_dims[dim]; + ++params.inner_dim_count; + continue; + } + + // First non-matching dimension is the broadcasting dimension. + eigen_assert(params.output_dims[dim] < m_dimensions[dim]); + params.bcast_dim = dim; + params.bcast_dim_size = params.output_dims[dim]; + break; + } + + // Calculate the input block size for looking into the input. + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = params.input_dims[dim]; + } + for (int i = params.inner_dim_count; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = 1; + } + params.input_block_strides = + internal::strides(params.input_block_sizes); + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // + // - bcast_block_sizes: + // [d_0, b_0, d_1, b_1, ...] + // + // - bcast_block_strides: + // [output_block_strides[0], output_block_strides[0] * d_0, + // output_block_strides[1], output_block_strides[1] * d_1, + // ...] + // + // - bcast_input_strides: + // [input_block_strides[0], 0, + // input_block_strides[1], 0, + // ...]. + // + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = IsColMajor ? 
copy_dim + 1 : copy_dim - 1; + + params.bcast_block_sizes[copy_dim] = params.input_dims[dim]; + params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + params.bcast_block_strides[copy_dim] = params.output_strides[dim]; + params.bcast_block_strides[broadcast_dim] = + params.output_strides[dim] * params.input_dims[dim]; + params.bcast_input_strides[copy_dim] = params.input_block_strides[dim]; + params.bcast_input_strides[broadcast_dim] = 0; + } + + for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) { + const int dim = IsColMajor ? i : 2 * NumDims - i - 1; + params.bcast_block_sizes[dim] = 1; + params.bcast_block_strides[dim] = 0; + params.bcast_input_strides[dim] = 0; + } + + return params; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const { + DSizes dimensions; + for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; + return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim( + BlockBroadcastingParams params, Index bcast_offset, + TensorBlockScratch& scratch, ScalarNoConst* materialized_output, + ScalarNoConst** materialized_input, + size_t* materialized_input_size) const { + if (params.bcast_dim_size == 1) { + // We just need one block read using the ready-set values above. + return BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else if (params.input_dims[params.bcast_dim] == 1) { + // Broadcast bcast dimension (< NumDims) by bcast_dim_size. + const int broadcast_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count + 1 + : 2 * NumDims - 2 * params.inner_dim_count - 2; + + params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim]; + + return BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else { + // Keep track of the total number of the coefficients written to the + // output block. + Index num_output_coeffs = 0; + + // The general case. Let's denote the output block as + // + // x[..., a:a+bcast_dim_size, :, ..., :] + // + // where a:a+bcast_dim_size is a slice on the bcast_dim dimension + // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3 + // sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[bcast_dim_start] in [a, a+bcast_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start] + // in [a, a+bcast_dim_size]. + // + // (3) c:a+bcast_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index bcast_dim_left_index = + bcast_offset / m_outputStrides[params.bcast_dim]; + + // Find b and c. + const Index input_bcast_dim_size = params.input_dims[params.bcast_dim]; + + // First multiple after a. This is b when <= bcast_dim_left_index + + // bcast_dim_size. + const Index first_multiple = + divup(bcast_dim_left_index, input_bcast_dim_size) * + input_bcast_dim_size; + + if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) { + // b exists, so does c. Find it. 
+ const Index last_multiple = + (bcast_dim_left_index + params.bcast_dim_size) / + input_bcast_dim_size * input_bcast_dim_size; + const int copy_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + : 2 * NumDims - 2 * params.inner_dim_count - 1; + const int broadcast_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + 1 + : 2 * NumDims - 2 * params.inner_dim_count - 2; + + if (first_multiple > bcast_dim_left_index) { + const Index head_size = first_multiple - bcast_dim_left_index; + params.input_block_sizes[params.bcast_dim] = head_size; + params.bcast_block_sizes[copy_bcast_dim] = head_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (first_multiple < last_multiple) { + params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = + (last_multiple - first_multiple) / input_bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + const Index offset = 
(first_multiple - bcast_dim_left_index) * + m_outputStrides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) { + const Index tail_size = + bcast_dim_left_index + params.bcast_dim_size - last_multiple; + params.input_block_sizes[params.bcast_dim] = tail_size; + params.bcast_block_sizes[copy_bcast_dim] = tail_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + const Index offset = (last_multiple - bcast_dim_left_index) * + m_outputStrides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + } else { + // b and c do not exist. + const int copy_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count + : 2 * NumDims - 2 * params.inner_dim_count - 1; + params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + + return num_output_coeffs; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock( + const Dimensions& input_block_sizes, + const Dimensions& input_block_strides, + const BroadcastDimensions& bcast_block_sizes, + const BroadcastDimensions& bcast_block_strides, + const BroadcastDimensions& bcast_input_strides, Index bcast_offset, + Index offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, + size_t* materialized_input_size) const { + // ---------------------------------------------------------------------- // + // Tensor block descriptor for reading block from the input. + const Index input_offset = bcast_offset + offset; + TensorBlockDesc input_desc( + IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset), + input_block_sizes); + + ArgTensorBlock input_block = m_impl.block(input_desc, scratch); + + // ---------------------------------------------------------------------- // + // Materialize input block into a temporary memory buffer only if it's not + // already available in the arg block. + const ScalarNoConst* input_buffer = NULL; + + if (input_block.data() != NULL) { + // Input block already has raw data, there is no need to materialize it. 
+ input_buffer = input_block.data(); + + } else { + // Otherwise we have to do block assignment into a temporary buffer. + + // Maybe reuse previously allocated buffer, or allocate a new one with a + // scratch allocator. + const size_t input_total_size = input_block_sizes.TotalSize(); + if (*materialized_input == NULL || + *materialized_input_size < input_total_size) { + *materialized_input_size = input_total_size; + void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); + *materialized_input = static_cast(mem); + } + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(input_block_sizes, input_block_strides, + *materialized_input), + input_block.expr()); + + input_buffer = *materialized_input; + } + + // ---------------------------------------------------------------------- // + // Copy data from materialized input block to the materialized output, using + // given broadcast strides (strides with zeroes). 
+ typedef internal::TensorBlockIO + TensorBlockIO; + + typename TensorBlockIO::Src src(bcast_input_strides, input_buffer); + typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, + materialized_output + offset); + + return TensorBlockIO::Copy(dst, src); + } - protected: - const Broadcast m_broadcast; +protected: + const Device EIGEN_DEVICE_REF m_device; + const typename internal::remove_reference::type m_broadcast; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 1ba7ef17..37645734 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -32,12 +32,13 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions - 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorChippingOp& type; + typedef const TensorChippingOp EIGEN_DEVICE_REF type; }; template @@ -50,6 +51,7 @@ template struct DimensionId { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + EIGEN_UNUSED_VARIABLE(dim); eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { @@ -78,44 +80,28 @@ template class TensorChippingOp : public TensorBase > { public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; + typedef TensorBase > Base; + typedef typename 
Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) - : m_xpr(expr), m_offset(offset), m_dim(dim) { - } + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } - EIGEN_DEVICE_FUNC - const Index offset() const { return m_offset; } - EIGEN_DEVICE_FUNC - const Index dim() const { return m_dim.actualDim(); } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp) protected: typename XprType::Nested m_xpr; @@ -136,20 +122,49 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - 
static const int PacketSize = internal::unpacket_traits::size; - + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + Layout = TensorEvaluator::Layout, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + // Chipping of outer-most dimension is a trivial operation, because we can + // read and write directly from the underlying tensor using single offset. + IsOuterChipping = (static_cast(Layout) == ColMajor && DimId == NumInputDims - 1) || + (static_cast(Layout) == RowMajor && DimId == 0), + // Chipping inner-most dimension. + IsInnerChipping = (static_cast(Layout) == ColMajor && DimId == 0) || + (static_cast(Layout) == RowMajor && DimId == NumInputDims - 1), + // Prefer block access if the underlying expression prefers it, otherwise + // only if chipping is not trivial. 
+ PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + !IsOuterChipping, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef internal::TensorBlockDescriptor + ArgTensorBlockDesc; + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -185,12 +200,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -205,21 +220,20 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + if (isInnerChipping()) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; } PacketReturnType rslt = internal::pload(values); return rslt; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); return m_impl.template packet(index + m_inputOffset); } else { @@ -231,6 +245,7 @@ struct TensorEvaluator, Device> } else { // Cross the stride boundary. Fallback to slow path. EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; @@ -263,29 +278,100 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { - CoeffReturnType* result = const_cast(m_impl.data()); - if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && - result) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool 
root_of_expr_ast = false) const { + const Index chip_dim = m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] + = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); + + // Try to reuse destination buffer for materializing argument block. + if (desc.HasDestinationBuffer()) { + DSizes arg_destination_strides; + for (int i = 0; i < NumInputDims; ++i) { + arg_destination_strides[i] + = i < chip_dim ? desc.destination().strides()[i] + : i > chip_dim ? desc.destination().strides()[i - 1] + : 0; // for dimensions of size `1` stride should never be used. + } + + arg_desc.template AddDestinationBuffer( + desc.destination().template data(), + arg_destination_strides); + } + + ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + + if (arg_block.data() != NULL) { + // Forward argument block buffer if possible. + return TensorBlock(arg_block.kind(), arg_block.data(), + desc.dimensions()); + + } else { + // Assign argument block expression to a buffer. + + // Prepare storage for the materialized chipping result. 
+ const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + arg_desc.dimensions(), + internal::strides(arg_desc.dimensions()), + block_storage.data()), + arg_block.expr()); + + return block_storage.AsTensorMaterializedBlock(); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (isOuterChipping() && result) { return result + m_inputOffset; } else { return NULL; } } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + if (isInnerChipping()) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer + // division. 
eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; } else { @@ -297,13 +383,25 @@ struct TensorEvaluator, Device> return inputIndex; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const { + return IsInnerChipping || + (static_cast(Layout) == ColMajor && m_dim.actualDim() == 0) || + (static_cast(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const { + return IsOuterChipping || + (static_cast(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == RowMajor && m_dim.actualDim() == 0); + } + Dimensions m_dimensions; Index m_stride; Index m_inputOffset; Index m_inputStride; TensorEvaluator m_impl; const internal::DimensionId m_dim; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; }; @@ -321,15 +419,21 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -343,20 +447,19 @@ struct TensorEvaluator, Device> { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - 
if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + if (this->isInnerChipping()) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } - } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if (this->isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); } else { @@ -369,6 +472,7 @@ struct TensorEvaluator, Device> // Cross stride boundary. Fallback to slow path. EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; @@ -376,6 +480,36 @@ struct TensorEvaluator, Device> } } } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(this->m_impl.data() != NULL); + + const Index chip_dim = this->m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? 
desc.dimension(i - 1) + : 1; + } + + typedef TensorReshapingOp, + const typename TensorBlock::XprType> + TensorBlockExpr; + + typedef internal::TensorBlockAssignment + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target( + input_block_dims, + internal::strides(this->m_impl.dimensions()), + this->m_impl.data(), this->srcCoeff(desc.offset())), + block.expr().reshape(input_block_dims)); + } }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 59bf90d9..5235a8e6 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -37,6 +37,8 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template @@ -58,6 +60,7 @@ template class TensorConcatenationOp : public TensorBase, WriteAccessors> { public: + typedef TensorBase, WriteAccessors> Base; typedef typename internal::traits::Scalar Scalar; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; @@ -79,25 +82,7 @@ class TensorConcatenationOp : public TensorBase Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp) protected: typename LhsXprType::Nested m_lhs_xpr; typename RhsXprType::Nested m_rhs_xpr; @@ -117,14 +102,24 @@ struct TensorEvaluator::type 
PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -177,14 +172,14 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { left_index = subs[0]; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } } else { left_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } @@ -231,11 +228,13 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { right_index = subs[0]; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } } else { right_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } @@ -248,11 +247,12 
@@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index+i); } @@ -275,7 +275,15 @@ struct TensorEvaluator XprType; typedef typename Base::Dimensions Dimensions; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) : Base(op, device) { EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -344,7 +360,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h 
b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 20b29e5f..8b35f798 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -21,8 +21,8 @@ namespace Eigen { */ namespace internal { -template -struct traits > +template +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename gebp_traits::type, @@ -38,53 +38,305 @@ struct traits > typedef typename remove_reference::type _RhsNested; // From NumDims below. - static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; static const int Layout = traits::Layout; + typedef typename conditional::val, + typename traits::PointerType, + typename traits::PointerType>::type + PointerType; enum { Flags = 0 }; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorContractionOp& type; + typedef const TensorContractionOp& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorContractionOp type; + typedef TensorContractionOp type; }; -template -struct traits, Device_> > { +template +struct traits, Device_> > { typedef Indices_ Indices; typedef LeftArgType_ LeftArgType; typedef RightArgType_ RightArgType; + typedef OutputKernelType_ OutputKernelType; typedef Device_ Device; // From NumDims below. static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; }; +// Helper class to allocate and deallocate temporary memory for packed buffers. 
+template +struct TensorContractionBlockMemAllocator { + typedef void* BlockMemHandle; + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, + const Index bk, + const Index bn, + LhsScalar** lhs_block, + RhsScalar** rhs_block) { + eigen_assert(lhs_block); + eigen_assert(rhs_block); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); + eigen_assert(block_mem); + *lhs_block = reinterpret_cast(block_mem); + *rhs_block = reinterpret_cast(block_mem + sz.lhs_size); + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices( + Device& d, const Index bm, const Index bk, const Index bn, + const Index num_lhs, const Index num_rhs, const Index num_slices, + std::vector* lhs_blocks, + std::vector* rhs_blocks) { + eigen_assert(num_slices > 0); + eigen_assert(num_lhs >= 0 && num_rhs >= 0); + eigen_assert(num_lhs == 0 || lhs_blocks); + eigen_assert(num_rhs == 0 || rhs_blocks); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + void* block_mem = d.allocate( + (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); + eigen_assert(block_mem); + char* mem = static_cast(block_mem); + + for (Index x = 0; x < num_slices; x++) { + if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); + for (Index m = 0; m < num_lhs; m++) { + lhs_blocks[x][m] = reinterpret_cast(mem); + mem += sz.lhs_size; + } + if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); + for (Index n = 0; n < num_rhs; n++) { + rhs_blocks[x][n] = reinterpret_cast(mem); + mem += sz.rhs_size; + } + } + + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + d.deallocate(handle); + } + + private: + struct BlockSizes { + Index lhs_size; + Index rhs_size; + }; + EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, + const Index bk, + const Index bn) { + Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 
1); + BlockSizes sz; + sz.lhs_size = divup(bm * bk * sizeof(LhsScalar), align) * align; + sz.rhs_size = divup(bn * bk * sizeof(RhsScalar), align) * align; + return sz; + } +}; + +// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in +// ColMajor storage order. This property is guaranteed by the +// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack +// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix +// multiplication for these blocks. Default tensor contraction uses +// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see +// GeneralBlocPanelKernel.h for details). +// +// By specializing contraction kernels we can use other low level libraries to +// perform matrix multiplication, and still rely on Eigen contraction evaluator. +// This also includes full support in TensorContractionThreadPool, assuming that +// underlying gemm do not use it's own threading. +// +// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of +// multiplication, lhs tensor and rhs tensor respectively. +// +// - StorageIndex - index type for the tensor expressions. In practice almost +// always is Eigen::Index. +// +// - OutputMapper provides access to the memory of the output matrix. In +// practice it's always column major blas_data_mapper (it must be of ResScalar +// type). +// +// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional +// view into the Lhs/Rhs tensor expressions. In practice it's +// TensorContractionInputMapper, or some specialization of it based on the +// type of tensor expression (e.g. TensorImagePatchOp has optimized input +// mapper). +template +struct TensorContractionKernel { + // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C` + // (otherwise beta should be always equal to 1). 
+ enum { HasBeta = false }; + + EIGEN_DEVICE_FUNC + TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, + StorageIndex bm_, StorageIndex bk_, StorageIndex bn_) + : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {} + + // Pack blocks of Lhs and Rhs into contiguous blocks in memory. + typedef LhsScalar* LhsBlock; + typedef RhsScalar* RhsBlock; + + // Packed Lhs/Rhs block memory allocator. + typedef TensorContractionBlockMemAllocator + BlockMemAllocator; + typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; + + typedef typename internal::gebp_traits Traits; + + typedef internal::gemm_pack_lhs< + LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> + LhsPacker; + + typedef internal::gemm_pack_rhs + RhsPacker; + + typedef internal::gebp_kernel + GebpKernel; + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, + RhsBlock* rhs_block) { + return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); + } + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices( + Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, + const StorageIndex num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + return BlockMemAllocator::allocateSlices( + d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks); + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + BlockMemAllocator::deallocate(d, handle); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs( + LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex rows) { + LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, + /*offset*/ 0); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs( + RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex 
cols) { + RhsPacker()(*rhsBlock, data_mapper, depth, cols); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke( + const OutputMapper& output_mapper, const LhsBlock& lhsBlock, + const RhsBlock& rhsBlock, const StorageIndex rows, + const StorageIndex depth, const StorageIndex cols, + const ResScalar alpha, const ResScalar beta) { + // Default GEBP kernel does not support beta. + eigen_assert(beta == ResScalar(1)); + static const int kComputeStrideFromBlockDimensions = -1; + GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, + /*strideA*/ kComputeStrideFromBlockDimensions, + /*strideB*/ kComputeStrideFromBlockDimensions, + /*offsetA*/ 0, /*offsetB*/ 0); + } + + private: + // These are dimensions of the original Tensors, and selected block sizes. The + // actual block sizes passed to all function above might be smaller because of + // the partial blocks at the end. + const StorageIndex m; + const StorageIndex k; + const StorageIndex n; + const StorageIndex bm; + const StorageIndex bk; + const StorageIndex bn; +}; + } // end namespace internal -template -class TensorContractionOp : public TensorBase, ReadOnlyAccessors> +// Tensor contraction params that should enable to get from output matrix +// 2-dimensional coordinates to the output tensor dimensions. +struct TensorContractionParams { + // TensorContraction evaluator assumes that both tensors are in ColMajor + // layout, if tensors are in RowMajor evaluator swap lhs with rhs. + bool swapped_arguments; +}; + +// Output kernel allows to fuse operations into the tensor contraction. +// +// Examples: +// 1. Elementwise Relu transformation following Conv2D. +// 2. AddBias to the Conv2D output channels dimension. +// +// The NoOpOutputKernel implements an output kernel that does absolutely nothing. +struct NoOpOutputKernel { + /** + * Tensor contraction evaluator calls this kernel after finishing each block + * of output matrix. Output blocks belong to the 2-dimensional output tensor. 
+ * + * TensorContractionParams contains contraction dimensions information + * required to map output 2-d space into the expected output tensor space + * (potentially higher dimensional). + * + * \param[in] output_mapper Access to output tensor memory + * \param[in] params Tensor contraction parameters + * \param[in] i Index of a first row available through output_mapper + * \param[in] j Index of a first column available through output_mapper + * \param[in] num_rows Number of available rows + * \param[in] num_cols Number of available columns + */ + template + EIGEN_ALWAYS_INLINE void operator()( + const internal::blas_data_mapper& output_mapper, + const TensorContractionParams& params, Index i, + Index j, Index num_rows, Index num_cols) const { + EIGEN_UNUSED_VARIABLE(output_mapper); + EIGEN_UNUSED_VARIABLE(params); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(num_rows); + EIGEN_UNUSED_VARIABLE(num_cols); + } +}; + +template +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( - const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims, + const OutputKernelType& output_kernel = OutputKernelType()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), + m_output_kernel(output_kernel) {} EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } @@ -98,35 +350,48 @@ class TensorContractionOp : public TensorBase::type& rhsExpression() const { return 
m_rhs_xpr; } + EIGEN_DEVICE_FUNC + const OutputKernelType& outputKernel() const { return m_output_kernel; } + protected: typename LhsXprType::Nested m_lhs_xpr; typename RhsXprType::Nested m_rhs_xpr; const Indices m_indices; + const OutputKernelType m_output_kernel; }; template -struct TensorContractionEvaluatorBase +struct TensorContractionEvaluatorBase : internal::no_assignment_operator { typedef typename internal::traits::Indices Indices; typedef typename internal::traits::LeftArgType LeftArgType; typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::OutputKernelType OutputKernelType; typedef typename internal::traits::Device Device; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + // Most of the code is assuming that both input tensors are ColMajor. 
If the // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: // If we want to compute A * B = C, where A is LHS and B is RHS, the code @@ -136,6 +401,9 @@ struct TensorContractionEvaluatorBase typedef typename internal::conditional< static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + typedef TensorEvaluator LeftEvaluatorType; + typedef TensorEvaluator RightEvaluatorType; + static const int LDims = internal::array_size::Dimensions>::value; static const int RDims = @@ -149,16 +417,17 @@ struct TensorContractionEvaluatorBase typedef DSizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), m_device(device), + m_output_kernel(op.outputKernel()), m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), + static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -233,7 +502,7 @@ struct TensorContractionEvaluatorBase // dimensions and right non-contracting dimensions. m_lhs_inner_dim_contiguous = true; int dim_idx = 0; - unsigned int nocontract_idx = 0; + Index nocontract_idx = 0; for (int i = 0; i < LDims; i++) { // find if we are contracting on index i of left tensor @@ -323,64 +592,144 @@ struct TensorContractionEvaluatorBase numext::swap(m_dimensions[i], m_dimensions[j]); } } + + // A set of parameters that will allow output kernel to get from output + // tensor dimensions (i, j) into the original tensor dimensions. 
+ // TODO(ezhulenev): Add parameters required to infer output tensor index for + // more complex contractions than 2x2 on internal dimension. + m_tensor_contraction_params.swapped_arguments = static_cast(Layout) == RowMajor; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { evalTo(data); return false; } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } } - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + if (dest) { 
+ evalToAsync(dest, [done]() { done(false); }); + } else { + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalToAsync(m_result, [done]() { done(true); }); } - } + }); + }); + } +#endif // EIGEN_USE_THREADS + +#ifndef TENSOR_CONTRACTION_DISPATCH +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } +#endif + +#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH +#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } +#endif + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + static_cast(this)->template evalProduct(buffer); + } + +#ifdef EIGEN_USE_THREADS + template + void evalToAsync(Scalar* buffer, EvalToCallback done) const { + static_cast(this) + ->template evalProductAsync(buffer, + 
std::move(done)); + } +#endif // EIGEN_USE_THREADS + + template + void evalProductSequential(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + } else { + this->template evalGemm(buffer); } } template - EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const Index cols = m_k_size; @@ -418,12 +767,41 @@ struct TensorContractionEvaluatorBase internal::general_matrix_vector_product::run( rows, cols, lhs, rhs, buffer, resIncr, alpha); + + typedef internal::blas_data_mapper OutputMapper; + m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, + static_cast(0), static_cast(0), rows, + static_cast(1)); } template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; + this->template evalGemmPartial(buffer, 0, k, 1); + } + + template + EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel( + Scalar* buffer, Index k_start, Index k_end, int num_threads) const { + evalGemmPartial(buffer, k_start, k_end, + num_threads); + } + + template + EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const { + eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size); + // columns in slice on left side, rows on right side + const Index k_slice = k_end - k_start; // rows in left side const Index m = this->m_i_size; @@ -431,16 +809,9 @@ struct TensorContractionEvaluatorBase // columns in right side const Index n = this->m_j_size; - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types + // define data mappers for Lhs and 
Rhs typedef typename internal::remove_const::type LhsScalar; typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; typedef TensorEvaluator LeftEvaluator; typedef TensorEvaluator RightEvaluator; @@ -462,11 +833,9 @@ struct TensorContractionEvaluatorBase typedef internal::blas_data_mapper OutputMapper; - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; - internal::gemm_pack_rhs pack_rhs; - - internal::gebp_kernel gebp; + typedef internal::TensorContractionKernel< + Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> + TensorContractionKernel; // initialize data mappers LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, @@ -478,42 +847,72 @@ struct TensorContractionEvaluatorBase OutputMapper output(buffer, m); // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking blocking(k, m, n, 1); + internal::TensorContractionBlocking + blocking(k_slice, m, n, num_threads); const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + + LhsBlock blockA; + RhsBlock blockB; + + TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); + + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + const BlockMemHandle packed_mem = + kernel.allocate(this->m_device, &blockA, &blockB); + + // If a contraction kernel does not support beta, explicitly initialize + // output buffer with zeroes. 
+ if (!TensorContractionKernel::HasBeta) { + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + } for(Index i2=0; i2= k_end) { + m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, + actual_mc, actual_nc); + } } } } - this->m_device.deallocate(blockA); - this->m_device.deallocate(blockB); + kernel.deallocate(this->m_device, packed_mem); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -536,11 +935,9 @@ struct TensorContractionEvaluatorBase return internal::ploadt(m_result + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } - protected: - // Prevent assignment - TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); +protected: Dimensions m_dimensions; contract_t m_k_strides; @@ -560,22 +957,25 @@ struct TensorContractionEvaluatorBase Index m_j_size; Index m_k_size; + TensorContractionParams m_tensor_contraction_params; + TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; - const Device& m_device; - Scalar* m_result; + const Device EIGEN_DEVICE_REF m_device; + OutputKernelType m_output_kernel; + EvaluatorPointerType m_result; }; // evaluator for default device -template -struct TensorEvaluator, Device> : +template +struct TensorEvaluator, Device> : public TensorContractionEvaluatorBase< - TensorEvaluator, Device> > { - typedef TensorEvaluator, Device> Self; + TensorEvaluator, Device> > { + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -609,17 +1009,12 @@ struct TensorEvaluator Dimensions; - EIGEN_DEVICE_FUNC 
TensorEvaluator(const XprType& op, const Device& device) : + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } - template - EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - this->template evalGemm(buffer); + template + void evalProduct(Scalar* buffer) const { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); } }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 5cf7b4f7..974feb0a 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -21,14 +21,28 @@ enum { // Default Blocking Strategy -template +template class TensorContractionBlocking { public: - typedef typename LhsMapper::Scalar LhsScalar; - typedef typename RhsMapper::Scalar RhsScalar; + /* + adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` + requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + (else HIPCC will error out) - EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + results in NVCC erroring out with the following error + + ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: + dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function + */ + + #if 
!defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { @@ -37,19 +51,22 @@ class TensorContractionBlocking { else { computeProductBlockingSizes(kc_, nc_, mc_, num_threads); } + + const int rhs_packet_size = internal::packet_traits::size; + kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? + kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } private: - Index kc_; - Index mc_; - Index nc_; + StorageIndex kc_; + StorageIndex mc_; + StorageIndex nc_; }; - } // end namespace internal } // end namespace Eigen diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index d65dbb40..3f315fed 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1,1391 +1,6 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014-2015 Benoit Steiner -// Copyright (C) 2015 Navdeep Jaitly -// Copyright (C) 2014 Eric Martin -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#if defined(__clang__) || defined(__GNUC__) +#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file" +#endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - -namespace Eigen { - -template -__device__ EIGEN_STRONG_INLINE void -EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, - const Index m_size, const Index n_size, const Index k_size) { - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - // declare and initialize 64 registers for output 8x8 block - - // prefetch registers - Scalar lhs_pf0; - Scalar lhs_pf1; - Scalar lhs_pf2; - Scalar lhs_pf3; - Scalar lhs_pf4; - Scalar lhs_pf5; - Scalar lhs_pf6; - Scalar lhs_pf7; - - Scalar rhs_pf0; - Scalar rhs_pf1; - Scalar rhs_pf2; - Scalar rhs_pf3; - Scalar rhs_pf4; - Scalar rhs_pf5; - Scalar rhs_pf6; - Scalar rhs_pf7; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in threadIdx.x. - - // On the LHS, we pad each row inside of each block with an extra element. This makes - // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts - // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. - - // On the RHS we just add 8 padding elements to the end of each block. This gives no bank - // conflicts on writes and also none on reads. 
- - // storage indices - const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; - const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; - - const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; - const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; - const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; - const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; - const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; - const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; - const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; - - const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; - const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; - const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; - const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; - const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; - const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; - const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; - const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; - - // in the loading code, the following variables are important: - // threadIdx.x: the vertical position in an 8x8 block - // threadIdx.y: the vertical index of the 8x8 block in the grid - // threadIdx.z: the horizontal position in an 8x8 block - // k: the horizontal index of the 8x8 block in the grid - // - // The k parameter is implicit (it was the loop counter for a loop that went - // from 0 to <8, but now that loop is unrolled in the below code. 
- - const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; - const Index lhs_vert = base_m + load_idx_vert; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = conv(0); \ - lhs_pf1 = conv(0); \ - lhs_pf2 = conv(0); \ - lhs_pf3 = conv(0); \ - lhs_pf4 = conv(0); \ - lhs_pf5 = conv(0); \ - lhs_pf6 = conv(0); \ - lhs_pf7 = conv(0); \ - \ - rhs_pf0 = conv(0); \ - rhs_pf1 = conv(0); \ - rhs_pf2 = conv(0); \ - rhs_pf3 = conv(0); \ - rhs_pf4 = conv(0); \ - rhs_pf5 = conv(0); \ - rhs_pf6 = conv(0); \ - rhs_pf7 = conv(0); \ - \ - if (!needs_edge_check || lhs_vert < m_size) { \ - const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ - const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ - const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ - const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ - const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ - const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ - const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ - const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ - \ - if (!needs_edge_check || lhs_horiz_7 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ - } else if (lhs_horiz_6 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - } else if (lhs_horiz_5 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ 
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - } else if (lhs_horiz_4 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - } else if (lhs_horiz_3 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - } else if (lhs_horiz_2 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - } else if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - } \ - } \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (!needs_edge_check || rhs_vert < k_size) { \ - const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ - const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ - const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ - const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ - const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ - const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ - const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ - const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ - \ - if (rhs_horiz_7 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ - } else if (rhs_horiz_6 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); 
\ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - } else if (rhs_horiz_5 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - } else if (rhs_horiz_4 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - } else if (rhs_horiz_3 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - } else if (rhs_horiz_2 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - } else if (rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - } \ - } \ - } \ - -#define writeRegToShmem(_) \ - lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ - rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ - \ - lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ - rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ - \ - lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ - rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ - \ - lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ - rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ - \ - lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ - rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ - \ - lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ - rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ - \ - lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ - rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ - \ - 
lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ - rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ - - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = conv(0); \ - Scalar res(i, 1) = conv(0); \ - Scalar res(i, 2) = conv(0); \ - Scalar res(i, 3) = conv(0); \ - Scalar res(i, 4) = conv(0); \ - Scalar res(i, 5) = conv(0); \ - Scalar res(i, 6) = conv(0); \ - Scalar res(i, 7) = conv(0); \ - - internal::scalar_cast_op conv; - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. Despite common sense, - // the code is a bit faster with this here then at bottom of loop - __syncthreads(); - - prefetchIntoRegisters(base_k); - writeRegToShmem(); - - #undef prefetchIntoRegisters - #undef writeRegToShmem - - // wait for shared mem packing to be done before starting computation - __syncthreads(); - - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). 
- -#define lcol(i) _lcol##i - Scalar lcol(0); - Scalar lcol(1); - Scalar lcol(2); - Scalar lcol(3); - Scalar lcol(4); - Scalar lcol(5); - Scalar lcol(6); - Scalar lcol(7); - -#define rrow(j) _rrow##j - Scalar rrow(0); - Scalar rrow(1); - Scalar rrow(2); - Scalar rrow(3); - Scalar rrow(4); - Scalar rrow(5); - Scalar rrow(6); - Scalar rrow(7); - - // Now x corresponds to k, y to m, and z to n - const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; - const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; - -#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] -#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] - -#define loadData(i, j) \ - lcol(0) = lhs_element(0, j); \ - rrow(0) = rhs_element(i, 0); \ - lcol(1) = lhs_element(1, j); \ - rrow(1) = rhs_element(i, 1); \ - lcol(2) = lhs_element(2, j); \ - rrow(2) = rhs_element(i, 2); \ - lcol(3) = lhs_element(3, j); \ - rrow(3) = rhs_element(i, 3); \ - lcol(4) = lhs_element(4, j); \ - rrow(4) = rhs_element(i, 4); \ - lcol(5) = lhs_element(5, j); \ - rrow(5) = rhs_element(i, 5); \ - lcol(6) = lhs_element(6, j); \ - rrow(6) = rhs_element(i, 6); \ - lcol(7) = lhs_element(7, j); \ - rrow(7) = rhs_element(i, 7); \ - -#define computeCol(j) \ - res(0, j) += lcol(0) * rrow(j); \ - res(1, j) += lcol(1) * rrow(j); \ - res(2, j) += lcol(2) * rrow(j); \ - res(3, j) += lcol(3) * rrow(j); \ - res(4, j) += lcol(4) * rrow(j); \ - res(5, j) += lcol(5) * rrow(j); \ - res(6, j) += lcol(6) * rrow(j); \ - res(7, j) += lcol(7) * rrow(j); \ - -#define computePass(i) \ - loadData(i, i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol -#undef rrow -#undef lhs_element -#undef rhs_element -#undef loadData -#undef computeCol -#undef 
computePass - } // end loop over k - - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. -#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) - -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ - -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); - -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix - - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. 
- // (2) is slightly faster than (1) due to less branching and more ILP - - // TODO: won't yield much gain, but could just use currently unused shared mem - // and then we won't have to sync - // wait for shared mem to be out of use - __syncthreads(); - -#define writeResultShmem(i, j) \ - lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ - -#define writeRow(i) \ - writeResultShmem(i, 0); \ - writeResultShmem(i, 1); \ - writeResultShmem(i, 2); \ - writeResultShmem(i, 3); \ - writeResultShmem(i, 4); \ - writeResultShmem(i, 5); \ - writeResultShmem(i, 6); \ - writeResultShmem(i, 7); \ - - if (threadIdx.x == 0) { - writeRow(0); - writeRow(1); - writeRow(2); - writeRow(3); - writeRow(4); - writeRow(5); - writeRow(6); - writeRow(7); - } -#undef writeResultShmem -#undef writeRow - - const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); - const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); - - if (threadIdx.x < max_i_write) { - if (max_j_write == 8) { - // TODO: can i trade bank conflicts for coalesced writes? 
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; - Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; - Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; - Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; - Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; - Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; - Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; - Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; - - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; - } else { -#pragma unroll 7 - for (int j = 0; j < max_j_write; j++) { - Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; - } - } - } -#undef res -} - - -template -__global__ void -__launch_bounds__(512) -EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ Scalar lhs_shmem[72 * 64]; - __shared__ Scalar rhs_shmem[72 * 64]; - - const Index m_block_idx = blockIdx.x; 
- const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][16], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - typedef float Scalar; - - // prefetch registers - float4 lhs_pf0, rhs_pf0; - - float4 results[4]; - for (int i=0; i < 4; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ - - - Index lhs_vert = base_m+threadIdx.x*4; - - for (Index k = 0; k < k_size; k += 16) { - lhs_pf0 = internal::pset1(0); - rhs_pf0 = internal::pset1(0); - - Index lhs_horiz = threadIdx.y+k; - prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) - - Index rhs_vert = k+(threadIdx.x%4)*4; - Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; - - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY 
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } else { - if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - float x1, x2 ; - // the following can be a bitwise operation..... some day. - if((threadIdx.x%8) < 4) { - x1 = rhs_pf0.y; - x2 = rhs_pf0.w; - } else { - x1 = rhs_pf0.x; - x2 = rhs_pf0.z; - } - x1 = __shfl_xor(x1, 4); - x2 = __shfl_xor(x2, 4); - if((threadIdx.x%8) < 4) { - rhs_pf0.y = x1; - rhs_pf0.w = x2; - } else { - rhs_pf0.x = x1; - rhs_pf0.z = x2; - } - - // We have 64 features. - // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. - // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. - // ... - // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 - // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 - // ... - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); - - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // ... - // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. 
(60, 61) - // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) - // ... - - lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); - - -#define add_vals(fl1, fl2, fr1, fr2)\ - results[0].x += fl1.x * fr1.x;\ - results[0].y += fl1.y * fr1.x;\ - results[0].z += fl2.x * fr1.x;\ - results[0].w += fl2.y * fr1.x;\ -\ - results[1].x += fl1.x * fr1.y;\ - results[1].y += fl1.y * fr1.y;\ - results[1].z += fl2.x * fr1.y;\ - results[1].w += fl2.y * fr1.y;\ -\ - results[2].x += fl1.x * fr2.x;\ - results[2].y += fl1.y * fr2.x;\ - results[2].z += fl2.x * fr2.x;\ - results[2].w += fl2.y * fr2.x;\ -\ - results[3].x += fl1.x * fr2.y;\ - results[3].y += fl1.y * fr2.y;\ - results[3].z += fl2.x * fr2.y;\ - results[3].w += fl2.y * fr2.y;\ - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 16; koff ++) { - // 32 x threads. - float2 fl1 = lhs_shmem2[koff][threadIdx.x]; - float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; - - int start_feature = threadIdx.y * 4; - float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - - add_vals(fl1, fl2, fr1, fr2) - } - __syncthreads(); - } - -#undef prefetch_lhs -#undef add_vals - - Index horiz_base = threadIdx.y*4+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - // CHECK LHS - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + 
i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK RHS - /* - int ncols_rem = fminf(n_size- horiz_base, 4); - for (int i = 0; i < ncols_rem; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - }*/ - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. 
- for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][32], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - typedef float Scalar; - - // prefetch registers - float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; - float4 rhs_pf0, rhs_pf1; - - float4 results[8]; - for (int i=0; i < 8; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - - Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; - for (Index k = 0; k < k_size; k += 32) { - lhs_pf0 = internal::pset1(0); - lhs_pf1 = internal::pset1(0); - lhs_pf2 = internal::pset1(0); - lhs_pf3 = internal::pset1(0); - - rhs_pf0 = internal::pset1(0); - rhs_pf1 = internal::pset1(0); - - if (!CHECK_LHS_BOUNDARY) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 
=lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else { - // just CHECK_LHS_BOUNDARY - if (lhs_vert + 3 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 2 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - 
lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 1 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - } - } else if (lhs_vert < m_size) { - if ((threadIdx.y/4+k+24) < k_size) 
{ - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - } - } - } - __syncthreads(); - Index rhs_vert = k+threadIdx.x*4; - Index rhs_horiz0 = threadIdx.y*2+base_n; - Index rhs_horiz1 = threadIdx.y*2+1+base_n; - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else { - if (rhs_horiz1 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = 
rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (k+threadIdx.x*4 + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (k+threadIdx.x*4 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - __syncthreads(); - // Loaded. Do computation - // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. - // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. - // .. - // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 - rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); - // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. - // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. - // .. - rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); - // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. - // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. - rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); - // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. - // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. 
- rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); - - // LHS. - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // ... - // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - - -#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ - results[0].x += a_feat1.x * f1.x;\ - results[1].x += a_feat1.x * f1.y;\ - results[2].x += a_feat1.x * f2.x;\ - results[3].x += a_feat1.x * f2.y;\ - results[4].x += a_feat1.x * f3.x;\ - results[5].x += a_feat1.x * f3.y;\ - results[6].x += a_feat1.x * f4.x;\ - results[7].x += a_feat1.x * f4.y;\ -\ - results[0].y += a_feat1.y * f1.x;\ - results[1].y += a_feat1.y * f1.y;\ - results[2].y += a_feat1.y * f2.x;\ - results[3].y += a_feat1.y * f2.y;\ - results[4].y += a_feat1.y * f3.x;\ - results[5].y += a_feat1.y * f3.y;\ - results[6].y += a_feat1.y * f4.x;\ - results[7].y += a_feat1.y * f4.y;\ -\ - results[0].z += a_feat2.x * f1.x;\ - results[1].z += a_feat2.x * f1.y;\ - results[2].z += a_feat2.x * f2.x;\ - results[3].z += a_feat2.x * f2.y;\ - results[4].z += a_feat2.x * f3.x;\ - results[5].z += a_feat2.x * f3.y;\ - results[6].z += a_feat2.x * f4.x;\ - results[7].z += a_feat2.x * f4.y;\ -\ - results[0].w += a_feat2.y * f1.x;\ - results[1].w += a_feat2.y * f1.y;\ - results[2].w += a_feat2.y * f2.x;\ - results[3].w += a_feat2.y * f2.y;\ - results[4].w += a_feat2.y * f3.x;\ - results[5].w += a_feat2.y * f3.y;\ - results[6].w += a_feat2.y * f4.x;\ - results[7].w += a_feat2.y * f4.y;\ - - lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); - lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = 
make_float2(lhs_pf2.x, lhs_pf2.y); - lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); - - lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); - lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); - lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); - lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 32; koff ++) { - float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; - float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; - - // first feature is at (threadIdx.y/4) * 8 last is at start + 8. - int start_feature = (threadIdx.y / 4) * 8; - - float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; - float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; - float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; - float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - - add_vals(a3, a4, br1, br2, br3, br4) - } - __syncthreads(); - } // end loop over k - - - __syncthreads(); - Index horiz_base = (threadIdx.y/4)*8+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert 
+ 2 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK BOUNDARY_B - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__global__ void -__launch_bounds__(256) -EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[64*32]; - __shared__ float2 rhs_shmem[128*8]; - - typedef float2 LHS_MEM[64][32]; - typedef float2 RHS_MEM[128][8]; - - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 128 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - bool check_rhs = (base_n + 63) >= n_size; - bool check_lhs128 = (base_m 
+ 127) >= m_size; - - if (!check_rhs) { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } else { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } -} - -template -__global__ void -__launch_bounds__(256) -EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[32][16]; - __shared__ float2 rhs_shmem[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size) { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } else { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } -} - - -template -struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { - - 
typedef GpuDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - } - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - }; - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - 
const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - } - }; - - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); - LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_GPU and __CUDACC__ -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#include "TensorContractionGpu.h" diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h new file mode 100644 index 00000000..c8180382 --- /dev/null +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -0,0 +1,1413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. 
+ + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. 
+ + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ 
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); 
\ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + 
lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). 
+ +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef 
computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) +#else +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) +#endif + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. 
+ // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? 
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(512, 1) +#else +__launch_bounds__(512) +#endif +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ 
Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ __forceinline__ void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.template loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); 
+ } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + #else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); + #endif + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... 
+ rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. 
+ float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + 
}*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ __forceinline__ void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, 
(threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, 
(threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, 
(threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = 
rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 
29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + 
results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } 
+ } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) 
+#else +__launch_bounds__(256) +#endif +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) + { + EIGEN_STATIC_ASSERT( (internal::is_same::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if 
(this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_GPU_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * 
sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + +#if defined(EIGEN_USE_HIP) + setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); +#else + setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); +#endif + + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_GPUCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index c28a10dd..9ab900b4 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -22,8 +22,19 @@ enum { /* * Implementation of the Eigen blas_data_mapper class for tensors. */ - -template struct CoeffLoader { +/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which +/// is scalar * for CoeffLoader. 
+template class MakePointer_ = MakePointer> +struct CoeffLoader; + +template class MakePointer_ = MakePointer> +class BaseTensorContractionMapper; + +template class MakePointer_> +struct CoeffLoader { enum { DirectOffsets = false }; @@ -34,6 +45,12 @@ template struct CoeffLoader { eigen_assert(false && "unsupported"); } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type + data() const { + eigen_assert(false && "unsupported"); + return NULL; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -42,12 +59,19 @@ template struct CoeffLoader { return m_tensor.template packet(index); } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif private: const Tensor m_tensor; }; -template struct CoeffLoader { +template class MakePointer_> +struct CoeffLoader { enum { DirectOffsets = true }; @@ -58,6 +82,11 @@ template struct CoeffLoader { m_data += offset; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type + data() const { + return m_data; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -65,15 +94,23 @@ template struct CoeffLoader { { return internal::ploadt_ro(m_data + index); } + + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } + #endif private: typedef typename Tensor::Scalar Scalar; - const Scalar* m_data; + + typename MakePointer_::Type m_data; }; template + int packet_size, bool inner_dim_contiguous, 
int Alignment, template class MakePointer_ = MakePointer> class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC @@ -89,7 +126,7 @@ class SimpleTensorContractionMapper { m_k_strides(k_strides) { } enum { - DirectOffsets = CoeffLoader::DirectOffsets + DirectOffsets = CoeffLoader::DirectOffsets }; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { @@ -116,6 +153,7 @@ class SimpleTensorContractionMapper { EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val = left ? row : col; Index linidx = 0; + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx = nocontract_val / m_ij_strides[i]; linidx += idx * m_nocontract_strides[i]; @@ -132,6 +170,7 @@ class SimpleTensorContractionMapper { Index contract_val = left ? col : row; if(array_size::value > 0) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx = contract_val / m_k_strides[i]; linidx += idx * m_contract_strides[i]; @@ -156,6 +195,7 @@ class SimpleTensorContractionMapper { Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; if (array_size::value > array_size::value) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx0 = nocontract_val[0] / m_ij_strides[i]; const Index idx1 = nocontract_val[1] / m_ij_strides[i]; @@ -176,6 +216,7 @@ class SimpleTensorContractionMapper { Index contract_val[2] = {left ? col : row, left ? col : row + distance}; if (array_size::value> 0) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx0 = contract_val[0] / m_k_strides[i]; const Index idx1 = contract_val[1] / m_k_strides[i]; @@ -207,24 +248,41 @@ class SimpleTensorContractionMapper { return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? 
m_contract_strides[0] : 1; } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif + + const CoeffLoader& tensor() const { + return m_tensor; + } + + const nocontract_t& nocontract_strides() const { + return m_nocontract_strides; + } + const nocontract_t& ij_strides() const { return m_ij_strides; } + const contract_t& contract_strides() const { return m_contract_strides; } + const contract_t& k_strides() const { return m_k_strides; } + protected: - CoeffLoader m_tensor; + CoeffLoader m_tensor; const nocontract_t m_nocontract_strides; const nocontract_t m_ij_strides; const contract_t m_contract_strides; const contract_t m_k_strides; }; - template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper + bool inner_dim_reordered, int Alignment, template class MakePointer_> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -234,12 +292,11 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper::half HalfPacket; - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::size==packet_size,PacketT>::type + load(Index i, Index j) const + { // whole method makes column major assumption // don't need to add offsets for now (because operator handles that) @@ -254,7 +311,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper indexPair = this->computeIndexPair(i, j, packet_size - 1); const Index first = indexPair.first; - const Index last = indexPair.second; + const Index lastIdx = indexPair.second; // We can always 
do optimized packet reads from left hand side right now, because // the vertical matrix dimension on the left hand side is never contracting. @@ -262,7 +319,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper::value <= 1 || !inner_dim_reordered) && - (last - first) == (packet_size - 1)) { + (lastIdx - first) == (packet_size - 1)) { return this->m_tensor.template packet(first); } @@ -270,31 +327,44 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapperm_tensor.coeff(first); + EIGEN_UNROLL_LOOP for (Index k = 1; k < packet_size - 1; k += 2) { const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); data[k] = this->m_tensor.coeff(internal_pair.first); data[k + 1] = this->m_tensor.coeff(internal_pair.second); } - data[packet_size - 1] = this->m_tensor.coeff(last); + data[packet_size - 1] = this->m_tensor.coeff(lastIdx); - return pload(data); + return pload(data); } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - // whole method makes column major assumption + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::size!=packet_size,PacketT>::type + load(Index i, Index j) const + { + const Index requested_packet_size = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; - // don't need to add offsets for now (because operator handles that) - const Index half_packet_size = unpacket_traits::size; - if (half_packet_size == packet_size) { - return loadPacket(i, j); - } - EIGEN_ALIGN_MAX Scalar data[half_packet_size]; - for (Index k = 0; k < half_packet_size; k++) { - data[k] = operator()(i + k, j); + const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < requested_packet_size - 1; k += 2) { + const IndexPair internal_pair = 
this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); } - return pload(data); + data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { + return this->load(i,j); } }; @@ -303,11 +373,12 @@ template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper + bool inner_dim_reordered, int Alignment, template class MakePointer_> +class BaseTensorContractionMapper + : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -317,16 +388,17 @@ class BaseTensorContractionMapper EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload(data); + return pload(data); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { - return loadPacket(i, j); + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload(data); } }; @@ -335,14 +407,12 @@ template + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template class MakePointer_=MakePointer> class TensorContractionSubMapper { public: - typedef typename Tensor::PacketReturnType Packet; - typedef typename unpacket_traits::half HalfPacket; - typedef BaseTensorContractionMapper ParentMapper; - typedef TensorContractionSubMapper Self; + typedef BaseTensorContractionMapper ParentMapper; + typedef 
TensorContractionSubMapper Self; typedef Self LinearMapper; enum { @@ -374,27 +444,32 @@ class TensorContractionSubMapper { return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); + return m_base_mapper.template loadPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, j); + return m_base_mapper.template loadPacket(i, j); } - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { if (UseDirectOffsets) { - return m_base_mapper.template loadHalfPacket(i, 0); + return m_base_mapper.template load(i, j); } - return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { if (UseDirectOffsets) { m_base_mapper.storePacket(i, 0, p); } @@ -410,19 +485,30 @@ class TensorContractionSubMapper { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - 
EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); + return m_base_mapper.template loadPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } - template + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { return false; } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_base_mapper.bind(cgh); + } + #endif + + const ParentMapper& base_mapper() const { return m_base_mapper; } + Index vert_offset() const { return m_vert_offset; } + Index horiz_offset() const { return m_horiz_offset; } + private: ParentMapper m_base_mapper; const Index m_vert_offset; @@ -434,14 +520,14 @@ template + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template class MakePointer_=MakePointer> class TensorContractionInputMapper - : public BaseTensorContractionMapper { + : public BaseTensorContractionMapper { public: typedef Scalar_ Scalar; - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; typedef SubMapper VectorMapper; EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, @@ -459,9 +545,29 @@ class TensorContractionInputMapper EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(*this, i, j); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { + return 
Base::m_tensor; + } }; +template struct TensorContractionInputMapperTrait; + +template class MakePointer_> +struct TensorContractionInputMapperTrait > { + + typedef Tensor_ XprType; + static const bool inner_dim_contiguous = inner_dim_contiguous_; + static const bool inner_dim_reordered = inner_dim_reordered_; + }; + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h new file mode 100755 index 00000000..473c2284 --- /dev/null +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -0,0 +1,1650 @@ +// This file is part of Eigen, a lightweight C++ template library for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorContractionSycl.h + * + * \brief: + * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend + * + *****************************************************************/ + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + +namespace Eigen { + +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_DISABLE_GEMV +/*! + * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector + * contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor/vector + * + * \tparam StorageIndex determines the Index type. 
+ * + * \tparam NCWindow: determines the number of non-contracting elements to be processed by each work-group + * + * \tparam CFactor: determines the number of contracting elements to be processed by each thread + * + * \tparam NCFactor: determines the number of non-contracting elements to be processed by each thread + */ +template +struct TVPanelSize { + // LocalThreadSizeC: determines total number of threads per workgroup for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeNC: determines total number of threads per workgroup for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimNC: determines the tile size for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; + // TileSizeDimC: determines the tile size for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; + // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; + // WorkLoadPerThreadC: determines workload per thread for loading the contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = false; +}; +#endif + +/*! + * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor + contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor + * + * \tparam StorageIndex: determines the Index type.
+ * + * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro). + * + * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro). + * + * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered + */ + +template +struct TTPanelSize { + // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the + // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the +// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * 
WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/* ! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/* ! + * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. 
This function is used to guarantee coalesced and vectorized load whenever possible + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet(row + (col * ld)); +} + +/*! + * \brief read, special overload of read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. 
When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory. This function is used to guarantee + * coalesced and vectorized store whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex is the contracting dim index + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if

::type + write(PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is enabled. This function + * is used to guarantee coalesced and vectorized store whenever possible. + * + * \tparam data_source: an enum value representing the location of the data in the memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written to the global memory + * + * \param ptr: a pointer to the global memory + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size != 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing the location of the data in the memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written to the global memory + * + * \param ptr: a pointer to the global memory + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size == 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition for non-internal blocks.
+ * + * \tparam is_internal: determines if the block is internal + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { + return cond; +} + +/*! + * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * packetType; Otherwise it will be scalar Type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and + * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. 
+ * + * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + * + * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + */ +template +struct BlockProperties { + static EIGEN_CONSTEXPR bool packet_load = packet_load_; + typedef typename Eigen::internal::unpacket_traits::type OutScalar; + static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; + typedef typename Eigen::internal::conditional::type OutType; + static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size; + static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); + static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); + static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); +}; + +/*! + * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see + * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup, + * work-items + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \param linearLocalThreadId: determines the linearized location of a thread within a work-group + * + * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when + * tall/skinny algorithm is used + * + * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of + * the flattened tensor. + * + * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the + * flattened tensor. It will be > 1 when tall/skinny algorithm is used. 
+ * + * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param mGlobalOffset: determines the logical start position of each thread for the m dimension on a + * flattened tensor + * + * \param nGlobalOffset: determines the logical start position of each thread for the n dimension on a + * flattened tensor + * + * \param kSize : determines the number of the k elements of the flattened Tensor to be processed by each thread for the + * given tensor block. This differs from the K dimension of the flattened Tensor when the Tall/Skinny algorithm is used. + * + * \param is_internal : this determines whether the thread within the work-group computes an internal block of tensor or + * the edge blocks. When it is internal, there is no need to check the boundaries and all the if statements can be + * resolved by the compiler.
+ */ +template +struct ThreadProperties { + const StorageIndex linearLocalThreadId; + const StorageIndex kGroupId; + const StorageIndex mGroupOffset; + const StorageIndex nGroupOffset; + const StorageIndex kGroupOffset; + const StorageIndex mLocalOffset; + const StorageIndex nLocalOffset; + const StorageIndex mGlobalOffset; + const StorageIndex nGlobalOffset; + StorageIndex kSize; + const bool is_internal; + // this is used to adjust the last block + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( + const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, + const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, + const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, + StorageIndex kSize_, const bool is_internal_) + : linearLocalThreadId(linearLocalThreadId_), + kGroupId(kGroupId_), + mGroupOffset(mGroupOffset_), + nGroupOffset(nGroupOffset_), + kGroupOffset(kGroupOffset_), + mLocalOffset(mLocalOffset_), + nLocalOffset(nLocalOffset_), + mGlobalOffset(mGlobalOffset_), + nGlobalOffset(nGlobalOffset_), + kSize(kSize_), + is_internal(is_internal_) {} +}; + +/*! + * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation. 
+ * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification + (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as a matrix + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam input_mapper_properties : determines if the input tensors are matrices. If they are matrices, special memory + access is used to guarantee that the memory accesses are always coalesced. + * + * \tparam IsFinal : determines if this is the final kernel. If so, the result will be written in a final output. + Otherwise, the result of contraction will be written in a temporary buffer. This is the case when Tall/Skinny + contraction is used. So in this case, a final reduction step is required to compute final output.
+ + * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of + the algorithm to be used + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-group for m dimension + * + * \param groupSizeN: a logical number determining the number of work-group for n dimension + * + * \param numTiles: determines total number of tiles on the k dimension + * + * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + */ +template +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + + typedef BlockProperties + LHSBlockProperties; + + typedef BlockProperties + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor Scratch; + typedef cl::sycl::multi_ptr local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; + typedef + typename ::Eigen::internal::conditional::type + tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ? 
Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out + * different type of memory needed when local/no_local memory computation is called. + * + * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation + of the algorithm to be used + * \tparam the private memory size + * \param ptr the tile memory pointer type + */ + template + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of memHolder class when no local memory kernel is used. + */ + template + struct MemHolder { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from + * global memory to local/private memory when local/no_local algorithm used. + * + * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type. 
+ * + * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the + * same as rhs_scratch_extract for private memory. + */ + struct TiledMemory { + MemHolder lhs_scratch_extract; + MemHolder rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair lhs_extract_index; + const std::pair rhs_extract_index; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &, local_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &thread_properties, local_ptr block_start_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( + 
local_id_extract(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; + const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; + const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; + const StorageIndex tmp = itemID.get_group(0) / groupSizeM; + const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; + const StorageIndex kGroupId = IsFinal ? 
0 : tmp / groupSizeN; + const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; + const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; + const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; + const StorageIndex nLocalOffset = NStride * nLocalThreadId; + const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; + const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; + + const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK; + StorageIndex kGroupOffset = kGroupId * kSizePerWG; + const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && + triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && + triple_dim.K - kGroupOffset >= kSizePerWG; + // this is used to adjust the last block + StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); + // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to + // tile + kGroupOffset += kSize; + + auto thread_properties = + ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, + mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); + + auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); + + (thread_properties.is_internal) ? 
compute_panel(itemID, thread_properties, out_ptr) + : compute_panel(itemID, thread_properties, out_ptr); + } + // The compute block computes the contraction operation private block for each thread and store the resutl in the + // privateRes memory of Each computation the compute block function is independent of local and no local concepts as + // it only compute the block on each thread's private memory space + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, + PacketReturnType *privateRes) { + StorageIndex idx = 0; + EIGEN_CONSTEXPR StorageIndex lhs_stride = + contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function write the computed contraction operation in the private memory of each thread to the global + // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base + // class. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // when local memory is not used M and N are both accessed in a coalesced way. 
However, when local memory is + // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId* + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local + // memory and extracting from local to global is the same as no transposed version. However, when local memory is + // not used and RHS is transposed we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privetOut = privateRes[wLPTM]; + if (check_boundary(chk_bound(globalRow, nId))) { + // Store the final results in C. 
The C matrix has always M as a first StorageIndex and N as a second + // StorageIndex Therefore it is always coalesced layout + write(privetOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // when no local memory is used the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair &, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? 
NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary(chk_bound(cIndex, ncIndex))) { + auto val = + read(inpt, ncIndex, cIndex, ld); + + write(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ? 
ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC + : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); + private_ptr += InputBlockProperties::nc_stride; + } + // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC + private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; + cIndex += InputBlockProperties::c_stride; + } + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract( + const StorageIndex &linearLocalThreadId) { + const StorageIndex localThreadNC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + const StorageIndex localThreadC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + return std::pair(localThreadNC, localThreadC); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { + db_offset = !db_offset; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept { + return; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> & +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID +#endif + ) noexcept { +#ifdef 
EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID.barrier(cl::sycl::access::fence_space::local_space); +#else + return; +#endif + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> &itemID) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type sync_thread( + const cl::sycl::nd_item<1> &) { + return; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + TiledMemory &tiled_input_block, + PacketReturnType *privateRes, bool &db_offset) { + // Tiling the Rhs block from global to local memory + extract_block( + rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), + tiled_input_block.rhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + sync_thread(itemID); + + // Tiling the Lhs block from global to local memory + extract_block( + lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), + tiled_input_block.lhs_extract_index, + contraction_tp == contraction_type::local ?
thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread(itemID); + // switch to compute mede + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // when local memory is available the following compute_panel will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + OutPtr out_ptr) { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = 0; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, Local local_ptr, const std::pair& local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisable by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? 
NC : triple_dim.K; + if (check_boundary(chk_bound(CIndex, NCIndex))) { + auto val = + read(inpt, NCIndex, CIndex, ld); + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special + * case of Tensor Tensor contraction. 
+ * + * \tparam OutScalar: determines the output scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) + * + * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam KFactor: determines the number of elements in K dimension in a Tile + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector + * + * \tparam IsFinal: determines whether this is the final kernel. If so, the result will be written in a final output. + * Otherwise, the result of contraction will be written in a temporary buffer.
+ * + * \param scratch: determines the local memory containing the vector block for each work-group + * + * \param vec: determines the vector input (tensor mapper) + * + * \param mat: determines the tensor input (tensor mapper) + * + * \param out_res: determines the output vector containing the contraction result + * + * \param nonContractGroupSize: a logical number determining the number of work-groups for the non-contracting dimension + * + * \param nonContractDim: determines the size of the non-contracting dimension for the flattened tensor + * + * \param contractDim: determines the size of the contracting dimension for the flattened tensor + * + */ +template +struct GeneralVectorTensor { + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + typedef cl::sycl::accessor Scratch; + + static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + + // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make + // sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true.
+ typedef BlockProperties + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res.get_pointer() + (IsFinal ? 
0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar 
outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? 
globalNonContractDimOffset + privateOffsetNC + : globalContractDimOffset + privateOffsetC) + : OutScalar(0); + + outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]); + privateOffsetNC += Properties::LocalThreadSizeNC; + } + privateOffsetC += Properties::LocalThreadSizeC; +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + in_scratch_ptr += Properties::LocalThreadSizeC; +#endif + } + + auto out_scratch_ptr = local_output + outScratchIndex; + // Each block of 16*16 element in shared memory should reduce to 16*1 + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + *out_scratch_ptr = outScalar[j]; + + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + if (is_lhs_vec) { + nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; + contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; + outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + } + + out_scratch_ptr = local_output + outScratchIndex; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + EIGEN_UNROLL_LOOP + for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (contractId < offset) { + StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); + *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; + } + } + // moving to next 16 by 16 block + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + + if (contractId == 0) { + out_scratch_ptr = local_output + nonContractId; + StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; + out_ptr += global_final_offset; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + if (check_boundary(global_final_offset < nonContractDim)) { + auto res = *out_scratch_ptr; + + *out_ptr = res; + out_ptr += 
Properties::LocalThreadSizeNC; + } + // moving to next 16 by 16 block to ge the next 16 reduced elements + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; + } + } + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, + const StorageIndex &linearLocalThreadId, + const StorageIndex &cOffset, const StorageIndex &C) { + local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; + StorageIndex cIndex = cOffset; + for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { + if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { + auto val = read(inpt, StorageIndex(0), + cIndex, StorageIndex(1)); + write(val, local_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + OutScalar val = + (cIndex + i < C) + ? read( + inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) + : OutScalar(0); + write(val, local_ptr + i); + } + } + local_ptr += InputBlockProperties::c_stride * GroupSize; + cIndex += InputBlockProperties::c_stride * GroupSize; + } + } +}; +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + +/*! + * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction + * operation, when all the dimensions are contracting dimensions. 
This Kernel reduces two tensors to a scalar + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \param scratch: local memory holding one partial sum per work-item, reduced within each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param rng: determines the total input data size + */ +template +struct GeneralScalarContraction { + typedef cl::sycl::accessor Scratch; + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex rng; + + EIGEN_DEVICE_FUNC + GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex rng_) + : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} + + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_res.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + StorageIndex globalid = itemID.get_global_id(0); + StorageIndex localid = itemID.get_local_id(0); + OutScalar accumulator = OutScalar(0); + for (StorageIndex i =
globalid; i < rng; i += itemID.get_global_range(0)) { + accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator); + } + auto out_scratch_ptr = scratch_ptr + localid; + *out_scratch_ptr = accumulator; + for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); + } + } + if (localid == 0) { + out_ptr[itemID.get_group(0)] = accumulator; + } + } +}; +#endif + +} // namespace internal +} // namespace TensorSycl + +template +struct TensorEvaluator, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { + static_assert(std::is_same::value, + "SYCL tensor contraction does not support output kernels."); + + typedef Eigen::SyclDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index StorageIndex; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} + }; + enum { + Layout = TensorEvaluator::Layout, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + }; + + static EIGEN_CONSTEXPR int LDims = Base::LDims; + static EIGEN_CONSTEXPR int RDims = Base::RDims; + static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int 
NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef typename Eigen::internal::remove_const::type LhsScalar; + typedef typename Eigen::internal::remove_const::type RhsScalar; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + template + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (!data) { + this->m_result = this->m_device.get( + static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; + } + evalToSycl(data); + return (this->m_result != NULL); + } + const Eigen::SyclDevice &device() const { return this->m_device; } + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + 
template + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && triple_dim.N != 1) { + LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) 
> 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); + } + if (skinny) + adjustTT(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT(buffer, lhs, rhs, triple_dim); + } + } + + template + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif + } + + template + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? 
std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? ((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + device().template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); + } else { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim); + + typedef Eigen::internal::SumReducer Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + 
Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); + + device().deallocate_temp(temp_pointer); + } + } + +#ifndef EIGEN_SYCL_DISABLE_GEMV + template + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize + Properties; + const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher( + vec, mat, tmp_global_accessor, 
thread_range, scratchSize, nCNumGroups, nonContractDim, C); + + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups); + + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + device().template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); + } + } +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread + // reduces at least 512 elementss individually, we get better performance. + const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? 
local_range : 1); + const StorageIndex global_range = num_work_group * local_range; + + typedef Eigen::TensorSycl::internal::GeneralScalarContraction< + CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> + ContractKernelName; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + device().template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, + thread_range, local_range, K); + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()); + + device().deallocate_temp(temp_pointer); + } else { + device().template binary_kernel_launcher(lhs, rhs, buffer, thread_range, + local_range, K); + } + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + this->m_leftImpl.cleanup(); + this->m_rightImpl.cleanup(); + + if (this->m_result) { + this->m_device.deallocate_temp(this->m_result); + this->m_result = NULL; + } + } + // The placeholder accessors must bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + this->m_leftImpl.bind(cgh); + this->m_rightImpl.bind(cgh); + this->m_result.bind(cgh); + } +}; +} // namespace Eigen +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index c70dea05..21be6ea4 100644 --- 
a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -15,57 +15,16 @@ namespace Eigen { -#ifdef EIGEN_USE_SIMPLE_THREAD_POOL -namespace internal { - -template -struct packLhsArg { - LhsScalar* blockA; - const LhsMapper& lhs; - const Index m_start; - const Index k_start; - const Index mc; - const Index kc; -}; - -template -struct packRhsAndKernelArg { - const MaxSizeVector* blockAs; - RhsScalar* blockB; - const RhsMapper& rhs; - OutputMapper& output; - const Index m; - const Index k; - const Index n; - const Index mc; - const Index kc; - const Index nc; - const Index num_threads; - const Index num_blockAs; - const Index max_m; - const Index k_block_idx; - const Index m_block_idx; - const Index n_block_idx; - const Index m_blocks; - const Index n_blocks; - MaxSizeVector* kernel_notifications; - const MaxSizeVector* lhs_notifications; - const bool need_to_pack; -}; - -} // end namespace internal -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - -template -struct TensorEvaluator, ThreadPoolDevice> : - public TensorContractionEvaluatorBase, ThreadPoolDevice> > { +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { typedef ThreadPoolDevice Device; - typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -112,31 +71,35 @@ struct TensorEvaluator + template void evalProduct(Scalar* buffer) const { - typedef internal::TensorContractionInputMapper< - LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, - contract_t, internal::packet_traits::size, - lhs_inner_dim_contiguous, false, Unaligned> - 
LhsMapper; - typedef internal::TensorContractionInputMapper< - RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, - contract_t, internal::packet_traits::size, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> - RhsMapper; - typedef internal::blas_data_mapper OutputMapper; - typedef internal::gemm_pack_lhs - LhsPacker; - typedef internal::gemm_pack_rhs< - RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> - RhsPacker; - typedef internal::gebp_kernel - GebpKernel; + evalProductImpl(buffer, NoCallback()); + } + + template + void evalProductAsync(Scalar* buffer, EvalToCallback done) const { + evalProductImpl(buffer, std::move(done)); + } + + template + void evalProductImpl(Scalar* buffer, DoneCallback done) const { + // This function computes a lot of heuristics in multiple steps, and it + // also has multiple exit points. To keep it sane, readable and all in one + // place, sync/async execution decision is made at runtime at the very end. + // + // (1) In sync mode we allocate Context on the stack, submit computations + // to the device thread pool, and block on a barrier until it is + // completed. + // + // (2) In async mode we allocate Context on the heap, and after all tasks + // are finished, we call provided the done callback, and delete a + // context from the heap. + // + // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state + // and temporary buffers, requried for executing the tensor contraction. + // They are responsible for cleaning it up after contraction is done. 
+ static const bool IsEvalInSyncMode = + std::is_same::value; const Index m = this->m_i_size; const Index n = this->m_j_size; @@ -172,14 +135,14 @@ struct TensorEvaluator blocking(k, m, n, 2); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking blocking(k, m, n, 2); bm = blocking.mc(); @@ -195,35 +158,45 @@ struct TensorEvaluator::numThreads( static_cast(n) * m, cost, this->m_device.numThreads()); + int num_threads_by_k = numThreadsInnerDim(m, n, k); + if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) { + // We are in the scenario where it is more effective to shard by the + // inner dimension. + if (IsEvalInSyncMode) { + EvalShardedByInnerDimContext ctx( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx.template run(); + } else { + auto* ctx = new EvalShardedByInnerDimContext( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx->template runAsync(); + } + + return; + } // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost // model is not tuned. Remove this when the cost model is tuned. if (n == 1) num_threads = 1; if (num_threads == 1) { - // The single-threaded algorithm should be faster in this case. - if (n == 1) - this->template evalGemv(buffer); - else - this->template evalGemm(buffer); + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, + Unaligned, (buffer)); + if (!IsEvalInSyncMode) done(); return; } // Now that we know number of threads, recalculate sharding and blocking. 
shard_by_col = shardByCol(m, n, num_threads); if (shard_by_col) { - internal::TensorContractionBlocking blocking(k, m, n, num_threads); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking blocking(k, m, n, num_threads); bm = blocking.mc(); @@ -255,6 +228,26 @@ struct TensorEvaluatorm_device.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism too much. With large number of threads we trade maximum + // parallelism for better memory locality. + const float oversharding_factor = + num_worker_threads <= 4 ? 8.0 : + num_worker_threads <= 8 ? 4.0 : + num_worker_threads <= 16 ? 2.0 : + num_worker_threads <= 32 ? 1.0 : + num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6; + + const bool parallelize_by_sharding_dim_only = + sharding_dim_tasks >= oversharding_factor * num_worker_threads; + // Last by not least, decide whether we want to issue both lhs and rhs // packing in parallel; or issue lhs packing first, and then issue rhs // packing when lhs packing completes (for !shard_by_col lhs and rhs are @@ -270,40 +263,139 @@ struct TensorEvaluatorm_leftImpl, this->m_left_nocontract_strides, - this->m_i_strides, this->m_left_contracting_strides, - this->m_k_strides); + } else { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ + nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \ + std::move(done)) + TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, + Alignment, CONTEXT_ARGS, run()); +#undef CONTEXT_ARGS + } + } - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, - this->m_j_strides, this->m_right_contracting_strides, - this->m_k_strides); + // ------------------------------------------------------------------------ // - Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, - k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, - 
shard_by_col, parallel_pack) - .run(); - } + // Dummy struct to represent an empty DoneCallback. + + struct NoCallback { + void operator()() { + eigen_assert(false && "NoCallback should never be called"); + } + }; + + // ------------------------------------------------------------------------ // - // Context coordinates a single parallel gemm operation. - template - class Context { + template + class EvalParallelNotification; + + // Synchronous evaluation notification that blocks caller thread in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context*, NoCallback) {} + void Notify() { done_.Notify(); } + void Wait() { done_.Wait(); } + private: + Eigen::Notification done_; + }; + + // Asynchronous evaluation notification that does not block in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context* ctx, DoneCallback done) + : ctx_(ctx), done_(std::move(done)) {} + + void Notify() { + // Make a copy of done callback, because it will be destructed when we + // will delete context in the next line (EvalParallelNotification is a + // data member of EvalParallelContext class). + DoneCallback done_copy = std::move(done_); + + // Delete parallel evaluation context. + delete ctx_; + + // Now safely call the done callback. + done_copy(); + } + + void Wait() {} + + private: + Context* ctx_; + DoneCallback done_; + }; + + // Context orchestrates sync/async parallel contraction evaluation. When it is + // executed in asynchronous mode, it owns all the shared state that might be + // accessible by block packing and kernel tasks. 
+ + template + class EvalParallelContext { public: - Context(const Device& device, int num_threads, LhsMapper& lhs, - RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, - Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, - Index gn, Index nm0, Index nn0, bool shard_by_col, - bool parallel_pack) - : device_(device), - lhs_(lhs), - rhs_(rhs), + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel< + Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> + TensorContractionKernel; + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + + EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, + Index tm, Index tn, Index tk, Index bm, Index bn, + Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack, + bool parallelize_by_sharding_dim_only, + DoneCallback done) + : created_by_thread_id_(std::this_thread::get_id()), + done_(this, std::move(done)), + device_(self->m_device), + lhs_(self->m_leftImpl, self->m_left_nocontract_strides, + self->m_i_strides, self->m_left_contracting_strides, + self->m_k_strides), + rhs_(self->m_rightImpl, self->m_right_nocontract_strides, + self->m_j_strides, self->m_right_contracting_strides, + self->m_k_strides), buffer_(buffer), output_(buffer, tm), + 
output_kernel_(self->m_output_kernel), + tensor_contraction_params_(self->m_tensor_contraction_params), num_threads_(num_threads), shard_by_col_(shard_by_col), parallel_pack_(parallel_pack), + parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), m_(tm), n_(tn), k_(tk), @@ -316,13 +408,29 @@ struct TensorEvaluator(bm_ * bk_ * sizeof(LhsScalar), align) * align; - size_t rhs_size = - divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; - packed_mem_ = static_cast(internal::aligned_malloc( - (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); - char* mem = static_cast(packed_mem_); - for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { - packed_lhs_[x].resize(nm0_); - for (Index m = 0; m < nm0_; m++) { - packed_lhs_[x][m] = reinterpret_cast(mem); - mem += lhs_size; - } - packed_rhs_[x].resize(nn0_); - for (Index n = 0; n < nn0_; n++) { - packed_rhs_[x][n] = reinterpret_cast(mem); - mem += rhs_size; + packed_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/nm0_, // + /*num_rhs=*/nn0_, // + /*num_slices=*/std::min(nk_, P - 1), // + packed_lhs_, packed_rhs_); + + if (parallelize_by_sharding_dim_only_) { + const int num_worker_threads = device_.numThreadsInPool(); + + if (shard_by_col) { + can_use_thread_local_packed_ = new std::atomic[nn_]; + for (int i = 0; i < nn_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gn_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/0, // + /*num_rhs=*/num_blocks, // + /*num_slices=*/1, // + /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_); + + } else { + can_use_thread_local_packed_ = new std::atomic[nm_]; + for (int i = 0; i < nm_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gm_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/num_blocks, // + 
/*num_rhs=*/0, // + /*num_slices=*/1, &lhs_thread_local_pre_allocated_, // + /*rhs_blocks=*/nullptr); } } } - ~Context() { + ~EvalParallelContext() { for (Index x = 0; x < P; x++) { for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; delete[] state_kernel_[x]; } - internal::aligned_free(packed_mem_); + kernel_.deallocate(device_, packed_mem_); + if (parallelize_by_sharding_dim_only_) { + kernel_.deallocate(device_, thread_local_pre_alocated_mem_); + delete[] can_use_thread_local_packed_; + } } void run() { // Kick off packing of the first slice. signal_switch(0, 1); + // Wait for overall completion. - // TODO(dvyukov): this wait can lead to deadlock. - // If nthreads contractions are concurrently submitted from worker - // threads, this wait will block all worker threads and the system will - // deadlock. + // + // If parallel evaluation is executed in async mode, this is a no-op, and + // Wait() will return immediately. In synchronous mode it will block the + // caller thread until it will receive notification from last task. + // + // In async mode, last task when completed will call done callback from + // the same thread, and will delete this context. + // + // TODO(dvyukov): This wait can lead to deadlock if contraction is + // evaluated in synchronous mode. If nthreads contractions are + // concurrently submitted from worker threads, this wait will block all + // worker threads and the system will deadlock. done_.Wait(); } private: - Notification done_; + std::thread::id created_by_thread_id_; + + // This notification is specialized on the type of DoneCallback and can be + // blocking or non-blocking. 
+ EvalParallelNotification done_; + const Device& device_; - LhsMapper& lhs_; - RhsMapper& rhs_; + LhsMapper lhs_; + RhsMapper rhs_; Scalar* const buffer_; OutputMapper output_; + OutputKernelType output_kernel_; + TensorContractionParams tensor_contraction_params_; const int num_threads_; const bool shard_by_col_; const bool parallel_pack_; + const bool parallelize_by_sharding_dim_only_; // Matrix sizes. const Index m_; const Index n_; @@ -414,6 +562,8 @@ struct TensorEvaluator packed_lhs_[P - 1]; - std::vector packed_rhs_[P - 1]; + + // Handle to the allocated temporary storage for Lhs/Rhs blocks. + BlockMemHandle packed_mem_; + std::vector packed_lhs_[P - 1]; + std::vector packed_rhs_[P - 1]; + + // If we choose to parallelize only by the sharding dimension, each thread + // will have it's own "thead local" (not a c++ thread local storage) memory + // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory + // can't be passed to a kernel that might execute on a different thread. + // + // In practice when we are ready to pack memory for the sharding dimension + // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice + // already computed (99% of the time), and we can pack data into the thread + // local storage, and guarantee that all the kernels will be executed + // immediately in the same thread. This significantly increases L1 cache hit + // ratio and reduces pressure on the memory bus. + // + // It's still possible that kernel for the K-th slice will be ready before + // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_ + // and packed_rhs_ to allow kernels to be executed later on a thread + // different from the thread that was used for packing. + + // Handle for pre-allocated thread local memory buffers. 
+ BlockMemHandle thread_local_pre_alocated_mem_; + + // Only one of these will be initialized depending on shard_by_col value + // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`). + std::vector lhs_thread_local_pre_allocated_; + std::vector rhs_thread_local_pre_allocated_; + + // How many thread local blocks were already allocated. + std::atomic num_thread_local_allocations_; + const int thread_local_capacity; + + // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of + // unique threads in a system is below or equal to the number of threads in + // a thread pool. We will fallback on dynamic memory allocation after that. + + // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its + // size is equal to the grain size in Lhs/Rhs sharding dimension. + template + class ThreadLocalBlocks { + public: + ThreadLocalBlocks() = default; + + ThreadLocalBlocks(BlockType* base, size_t grain_size) + : is_pre_allocated_(true), + thread_local_pre_allocated_base_(base), + grain_size_(grain_size) {} + + ThreadLocalBlocks(BlockMemHandle mem_handle, + std::vector blocks) + : is_pre_allocated_(false), + mem_handle_(std::move(mem_handle)), + blocks_(std::move(blocks)) {} + + BlockType& block(int grain_index) { + eigen_assert(grain_index >= 0); + eigen_assert(static_cast(grain_index) < size()); + return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] + : blocks_[grain_index]; + } + + void Release(EvalParallelContext& ctx) const { + if (!is_pre_allocated_) { + ctx.kernel_.deallocate(ctx.device_, mem_handle_); + } + } + + size_t size() const { + return is_pre_allocated_ ? grain_size_ : blocks_.size(); + } + + private: + bool is_pre_allocated_; + + // Reuse pre-allocated thread local buffers. + BlockType* thread_local_pre_allocated_base_ = nullptr; + size_t grain_size_ = 0; + + // These will be initialized only if `is_pre_allocated == false`. 
+ BlockMemHandle mem_handle_{}; + std::vector blocks_; + }; + + // ThreadLocalBlocksInitialize callable does custom thread local blocks + // initialization, and will reuse pre-allocated buffers if possible, or will + // dynamically allocate new memory. + // + // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly + // for what side do we plan to do block allocation. + template + class ThreadLocalBlocksInitialize { + static constexpr bool kIsLhs = + !is_rhs && std::is_same::value; + static const bool kIsRhs = + is_rhs && std::is_same::value; + static_assert(kIsLhs || kIsRhs, "Unkown block type"); + + using Blocks = ThreadLocalBlocks; + + public: + ThreadLocalBlocksInitialize(EvalParallelContext& ctx) + : ctx_(ctx), + num_worker_threads_(ctx_.device_.numThreadsInPool()) {} + + void operator()(Blocks& blocks) { + const int n = ctx_.num_thread_local_allocations_.fetch_add( + 1, std::memory_order_relaxed); + + if (n >= num_worker_threads_) { + ThreadLocalBlocksAllocator::allocate(ctx_, blocks); + } else { + ThreadLocalBlocksAllocator::reuse(ctx_, n, blocks); + } + } + + private: + // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to + // TensorContractionKernel::allocateSlices into template specializations. + // Also explicit specializations are not allowed at class scope in C++03, + // EvalCtx type parameter is just a workaround for that limitation. 
+ template + struct ThreadLocalBlocksAllocator; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector rhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( + ctx.device_, + /*num_lhs=*/0, + /*num_rhs=*/ctx.gn_, + /*num_slices=*/1, + /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks); + + blocks = ThreadLocalBlocks(std::move(mem_handle), + std::move(rhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gn_); + } + }; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector lhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( + ctx.device_, + /*num_lhs=*/ctx.gm_, + /*num_rhs=*/0, + /*num_slices=*/1, + /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr); + + blocks = ThreadLocalBlocks(std::move(mem_handle), + std::move(lhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gm_); + } + }; + + EvalParallelContext& ctx_; + const int num_worker_threads_; + }; + + template + class ThreadLocalBlocksRelease { + public: + using Blocks = ThreadLocalBlocks; + ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {} + void operator()(Blocks& blocks) { blocks.Release(ctx_); } + + private: + EvalParallelContext& ctx_; + }; + + // ThreadLocalBlocks initialization callables. + using ThreadLocalLhsInit = + ThreadLocalBlocksInitialize; + using ThreadLocalRhsInit = + ThreadLocalBlocksInitialize; + + // ThreadLocalBlocks release callables. + using ThreadLocalLhsRelease = ThreadLocalBlocksRelease; + using ThreadLocalRhsRelease = ThreadLocalBlocksRelease; + + // Thread local containers for Lhs/Rhs block packs. 
In practice only one of + // them will be used, depending on the shard_by_col value. + Eigen::ThreadLocal, ThreadLocalLhsInit, + ThreadLocalLhsRelease> + lhs_thread_local_blocks_; + Eigen::ThreadLocal, ThreadLocalRhsInit, + ThreadLocalRhsRelease> + rhs_thread_local_blocks_; + + // After a particular shard for Kth slice missed thread local execution + // opportunity (K-1 slice didn't complete kernels execution), we can no + // longer schedule K+1 and following slices in thread local mode, because + // there is no more guarantee that previous kernels were executed + // sequentially in the same thread (size is nn_ or nm_). + std::atomic* can_use_thread_local_packed_; + std::atomic** state_kernel_[P]; // state_switch_ is frequently modified by worker threads, while other // fields are read-only after constructor. Let's move it to a separate cache @@ -461,69 +817,168 @@ struct TensorEvaluator state_packing_ready_[P]; std::atomic state_switch_[P]; + LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(!shard_by_col_); + ThreadLocalBlocks& blocks = lhs_thread_local_blocks_.local(); + + Index grain_index = m1 - m * gm_; + return blocks.block(internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? + } else { + return packed_lhs_[k % (P - 1)][m1]; + } + } + + RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(shard_by_col_); + ThreadLocalBlocks& blocks = rhs_thread_local_blocks_.local(); + + Index grain_index = n1 - n * gn_; + return blocks.block(internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? 
+ } else { + return packed_rhs_[k % (P - 1)][n1]; + } + } + + // In following two methods (pack_lhs and pack_rhs), if we know for sure + // that we'll be able to immediately call a kernel with packed data, and do + // not submit it to the thread pool, we can use thread local memory for + // packed data. + // + // We can only reliably check it if we are running all kernels in sync mode + // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to + // run, it's guaranteed that all kernels with larger values of m (n) are + // also ready, because we execute them in the same order for all K slices. + void pack_lhs(Index m, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && + can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. 
+ eigen_assert(k > 0); + can_use_thread_local_packed_[m].store(false, + std::memory_order_relaxed); + } + } + const Index mend = m * gm_ + gm(m); for (Index m1 = m * gm_; m1 < mend; m1++) - LhsPacker()(packed_lhs_[k % (P - 1)][m1], - lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); if (!parallel_pack_ && shard_by_col_) { + assert(!use_thread_local); signal_packing(k); } else { signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + for (Index n = nn_ - 1; n >= 0; n--) { + bool sync = parallelize_by_sharding_dim_only_ || n == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } } } void pack_rhs(Index n, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && shard_by_col_ && + can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in followig slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[n].store(false, + std::memory_order_relaxed); + } + } + const Index nend = n * gn_ + gn(n); for (Index n1 = n * gn_; n1 < nend; n1++) { - if (k == 0) { - // Zero the output memory in parallel. - // On 10000x2x10000 mm zeroing can easily take half of time. - // Zero (bn x m) row. Safe to do here because all kernels that will - // write to this memory depend on completion of this task. - // Note: don't call device_.memset() here. device_.memset() blocks on - // thread pool worker thread, which can lead to underutilization and - // deadlocks. 
+ if (!TensorContractionKernel::HasBeta && k == 0) { + // Zero the output memory in parallel, only if contraction kernel does + // not support `beta`. Otherwise we will pass beta 0.0 to the first + // call to the `TensorContractionKernel::invoke()`. + // + // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn + // x m) row. Safe to do here because all kernels that will write to + // this memory depend on completion of this task. Note: don't call + // device_.memset() here. device_.memset() blocks on thread pool + // worker thread, which can lead to underutilization and deadlocks. memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); } - RhsPacker()(packed_rhs_[k % (P - 1)][n1], - rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); } if (parallel_pack_ || shard_by_col_) { signal_switch(k + 1); - for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); + for (Index m = nm_ - 1; m >= 0; m--) { + bool sync = parallelize_by_sharding_dim_only_ || m == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } } else { + assert(!use_thread_local); signal_packing(k); } } - void kernel(Index m, Index n, Index k) { + void kernel(Index m, Index n, Index k, bool use_thread_local) { // Note: order of iteration matters here. Iteration over m is innermost - // because we want to reuse the same packed rhs in consequetive tasks + // because we want to reuse the same packed rhs in consecutive tasks // (rhs fits into L2$ while lhs only into L3$). const Index nend = n * gn_ + gn(n); const Index mend = m * gm_ + gm(m); + + // NOTE: output = alpha * LHS * RHS + beta * output. + const Scalar alpha = Scalar(1); + const Scalar beta = + (TensorContractionKernel::HasBeta && k == 0) ? 
Scalar(0) : Scalar(1); + if (shard_by_col_) { for (Index n1 = n * gn_; n1 < nend; n1++) { - for (Index m1 = m * gm_; m1 < mend; m1++) - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); + for (Index m1 = m * gm_; m1 < mend; m1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke( + output_mapper, + packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), + bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, + m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } } } else { for (Index m1 = m * gm_; m1 < mend; m1++) for (Index n1 = n * gn_; n1 < nend; n1++) { - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke( + output_mapper, + packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), + bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, + m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } } } - signal_kernel(m, n, k + 1, false); + signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); signal_switch(k + 2); } @@ -536,16 +991,23 @@ struct TensorEvaluator* state = &state_kernel_[k % P][m][n]; Index s = state->load(); eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) return; + if (s != 1 && state->fetch_sub(1) != 1) { + eigen_assert(!use_thread_local); + return; + } state->store(parallel_pack_ ? 
3 : 2, std::memory_order_relaxed); - if (sync) - kernel(m, n, k); - else - device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + if (sync) { + kernel(m, n, k, use_thread_local); + } else { + eigen_assert(!use_thread_local); + device_.enqueueNoNotification( + [=]() { kernel(m, n, k, use_thread_local); }); + } } void signal_switch(Index k, Index v = 1) { @@ -595,11 +1057,32 @@ struct TensorEvaluator 1) { + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + end = mid; + } + + // Decide if we want to run first packing task (start == 0) in + // async mode if we parallelize only by sharding dim: + // (1) pack_lhs and pack_rhs call signal_switch before completing + // all calls to signal_kernel, which in sync mode might lead + // to the execution of the first kernel of the k+1 slice, before + // completing a call to the last kernel of the k slice. + // (2) all pack tasks for sharded dim must be executed in a thread + // pool to get pre-allocated thead local buffers. + bool pack_async = + (start == 0) && + (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) && + (k > 0 || std::this_thread::get_id() == created_by_thread_id_); + + if (pack_async) { + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, end, k, rhs); }); + } else { + enqueue_packing_helper(start, end, k, rhs); + } } } @@ -611,10 +1094,364 @@ struct TensorEvaluator + using SyncEvalParallelContext = + EvalParallelContext; + + // ------------------------------------------------------------------------ // + + // EvalShardedByInnerDimContext orchestrates sync/async contraction + // evaluation, when we shard by inner dimension. When it is executed in + // asynchronous mode, it owns all the shared state that might be accessible by + // block processing tasks. 
+ + template + struct EvalShardedByInnerDimContext { + EvalShardedByInnerDimContext(const Self* self, int num_threads, + Scalar* result_buffer, + Index m_size, Index n_size, Index k_size, + DoneCallback done_callback) + : evaluator(self), + m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), + m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), + m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), + result(result_buffer), + m(m_size), + n(n_size), + k(k_size), + done(std::move(done_callback)), + buffer_size_bytes(m * n * sizeof(Scalar)), + block_size(blockSize(k, num_threads)), + num_blocks(divup(k, block_size)), + num_pending_blocks(internal::convert_index(num_blocks)), + l0_ranges(divup(num_blocks, l0_size)), + l0_state(l0_ranges), + block_buffers(num_blocks) { + // Keep count of pending gemm tasks for each l0 range. + for (int i = 0; i < l0_ranges; ++i) { + const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i); + l0_state.emplace_back(internal::convert_index(num_pending_tasks)); + } + + // Allocate temporary buffers for each block. + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 + ? result + : static_cast(evaluator->m_device.allocate( + buffer_size_bytes)); + block_buffers.emplace_back(buf); + } + } + + ~EvalShardedByInnerDimContext() { + for (Index i = 1; i < num_blocks; ++i) { + evaluator->m_device.deallocate(block_buffers[i]); + } + } + + template + void run() { + Barrier barrier(internal::convert_index(num_blocks)); + eval(barrier, 0, num_blocks); + barrier.Wait(); + + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + } + + template + void runAsync() { + evalAsync(0, num_blocks); + } + + private: + // The underlying GEMM kernel assumes that k is a multiple of + // the packet size and subtle breakage occurs if this is violated. 
+ static const Index packet_size = internal::packet_traits::size; + + const Self* evaluator; // TensorContraction evaluator + + // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + Scalar* result; + + Index m; + Index n; + Index k; + + DoneCallback done; + + // ----------------------------------------------------------------------// + // Algorithm parameters. + + // We will compute partial results into the buffers of this size. + Index buffer_size_bytes; + + Index block_size; + Index num_blocks; + + // Keep track of pending tasks when evaluate in async mode. + std::atomic num_pending_blocks; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks done, we need to add only these pre-aggregated blocks. + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + Index l0_ranges; + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state; // [0, l0_ranges) + + // Buffers allocated for each temporary block computation. 
+ MaxSizeVector block_buffers; // [0, num_blocks) + + template + void processBlock(Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + + TENSOR_CONTRACTION_DISPATCH( + evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } + } + + // Aggregate partial sums from l0 ranges. 
+ template + void aggregateL0Blocks() const { + Index l0_index = 1; + + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */ block_buffers[0]); + } + + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], + block_buffers[0]); + } + } + + void applyOutputKernel() const { + typedef internal::blas_data_mapper OutputMapper; + evaluator->m_output_kernel( + OutputMapper(result, m), evaluator->m_tensor_contraction_params, + static_cast(0), static_cast(0), m, n); + } + + // Compute block size with accounting for potentially incomplete last block. + Index actualBlockSize(Index block_idx) const { + return block_idx + 1 < num_blocks + ? block_size + : k + block_size - block_size * num_blocks; + }; + + // Compute range size with accounting for potentially incomplete last range. + Index actualRangeSize(Index num_ranges, Index range_size, + Index range_idx) const { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges + ? 
range_size + : num_blocks + range_size - range_size * num_ranges; + }; + + template + EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, + Scalar* tgt_buf) { + const int output_packet_size = + internal::unpacket_traits::size; + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const PacketReturnType src_val = + internal::pload(src_buf + i); + const PacketReturnType tgt_val = + internal::ploadt(tgt_buf + i); + const PacketReturnType sum = internal::padd(src_val, tgt_val); + internal::pstoret(tgt_buf + i, + sum); + } + for (; i < n; ++i) { + tgt_buf[i] += src_buf[i]; + } + } + + template + EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, + const Scalar* src_buf0, + const Scalar* src_buf1, + const Scalar* src_buf2, + Scalar* dst_buf) { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = + internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = + padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + + template + void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, &barrier, mid_block_idx, end_block_idx]() { + eval(barrier, mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + 
Index block_idx = start_block_idx; + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + barrier.Notify(); + } + + template + void evalAsync(Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, mid_block_idx, end_block_idx]() { + evalAsync(mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + + int v = num_pending_blocks.fetch_sub(1); + eigen_assert(v >= 1); + + if (v == 1) { + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + + // NOTE: If we call `done` callback before deleting this (context), + // it might deallocate Self* pointer captured by context, and we'll + // fail in destructor trying to deallocate temporary buffers. + + // Move done call back from context before it will be destructed. + DoneCallback done_copy = std::move(done); + + // We are confident that we are the last one who touches context. + delete this; + + // Now safely call the done callback. + done_copy(); + } + } + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs + // and gemm_pack_rhs, so we specify minimum desired block size. + static Index blockSize(Index k, int num_threads) { + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 
8 : packet_size; + return divup(index, kmultiple) * kmultiple; + }; + + const Index target_block_size = round_up(divup(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + return numext::mini( + k, numext::maxi(desired_min_block_size, target_block_size)); + } + + EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete; + void operator=(const EvalShardedByInnerDimContext&) = delete; }; + // ------------------------------------------------------------------------ // + + // Below are the function used by evalProductImpl heuristics, trying to select + // optimcal parameters for parallelization algorithm. + // Decide whether we want to shard m x n contraction by columns or by rows. static bool shardByCol(Index m, Index n, Index num_threads) { // Note: we are comparing both n and m against Traits::nr, it is not @@ -718,304 +1555,15 @@ struct TensorEvaluator - void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - evalGemm(buffer); - } - - template - void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - - const int lhs_packet_size = internal::unpacket_traits::size; - const int rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // TODO: packing could be faster sometimes if we supported row major tensor mappers - typedef internal::gemm_pack_lhs LhsPacker; - typedef internal::gemm_pack_rhs RhsPacker; - - // TODO: replace false, false with conjugate 
values? - typedef internal::gebp_kernel GebpKernel; - - typedef internal::packLhsArg packLArg; - typedef internal::packRhsAndKernelArg packRKArg; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // compute block sizes (which depend on number of threads) - const Index num_threads = this->m_device.numThreads(); - internal::TensorContractionBlocking blocking(k, m, n, num_threads); - Index mc = blocking.mc(); - Index nc = blocking.nc(); - Index kc = blocking.kc(); - eigen_assert(mc <= m); - eigen_assert(nc <= n); - eigen_assert(kc <= k); - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) - const Index k_blocks = CEIL_DIV(k, kc); - const Index n_blocks = CEIL_DIV(n, nc); - const Index m_blocks = CEIL_DIV(m, mc); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - /* cout << "m: " << m << " n: " << n << " k: " << k << endl; - cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; - cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; - cout << "num threads: " << num_threads << endl; - */ - - // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB - // aren't 16 byte aligned segfaults will happen due to SIMD instructions - // note: You can get away with allocating just a single blockA and offsets and meet the - // the alignment requirements with the assumption that - // (Traits::mr * sizeof(ResScalar)) % 16 == 0 - const Index numBlockAs = numext::mini(num_threads, m_blocks); - MaxSizeVector blockAs(num_threads); - for (int i = 0; i < num_threads; i++) { - blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); - } - - // To circumvent 
alignment issues, I'm just going to separately allocate the memory for each thread - // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. - // Other options: (1) reuse memory when a thread finishes. con: tricky - // (2) allocate block B memory in each thread. con: overhead - MaxSizeVector blockBs(n_blocks); - for (int i = 0; i < n_blocks; i++) { - blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); - } - - // lhs_notifications starts with all null Notifications - MaxSizeVector lhs_notifications(num_threads, nullptr); - - // this should really be numBlockAs * n_blocks; - const Index num_kernel_notifications = num_threads * n_blocks; - MaxSizeVector kernel_notifications(num_kernel_notifications, - nullptr); - - for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { - const Index k_start = k_block_idx * kc; - // make sure we don't overshoot right edge of left matrix - const Index actual_kc = numext::mini(k_start + kc, k) - k_start; - - for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { - const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); - - for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { - const Index m_start = mt_block_idx * mc; - const Index actual_mc = numext::mini(m_start + mc, m) - m_start; - eigen_assert(actual_mc > 0); - - Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; - - for (int i = 0; i < n_blocks; ++i) { - Index notification_id = (blockAId * n_blocks + i); - // Wait for any current kernels using this slot to complete - // before using it. 
- if (kernel_notifications[notification_id]) { - wait_until_ready(kernel_notifications[notification_id]); - delete kernel_notifications[notification_id]; - } - kernel_notifications[notification_id] = new Notification(); - } - const packLArg arg = { - blockAs[blockAId], // blockA - lhs, // lhs - m_start, // m - k_start, // k - actual_mc, // mc - actual_kc, // kc - }; - - // Delete any existing notification since we may be - // replacing it. The algorithm should ensure that there are - // no existing waiters on this notification. - delete lhs_notifications[blockAId]; - lhs_notifications[blockAId] = - this->m_device.enqueue(&Self::packLhs, arg); - } - - // now start kernels. - const Index m_base_start = m_block_idx * mc; - const bool need_to_pack = m_block_idx == 0; - - for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { - const Index n_start = n_block_idx * nc; - const Index actual_nc = numext::mini(n_start + nc, n) - n_start; - - // first make sure the previous kernels are all done before overwriting rhs. Also wait if - // we're going to start new k. In both cases need_to_pack is true. 
- if (need_to_pack) { - for (Index i = num_blocks; i < num_threads; ++i) { - Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; - Index future_id = (blockAId * n_blocks + n_block_idx); - wait_until_ready(kernel_notifications[future_id]); - } - } - - packRKArg arg = { - &blockAs, // blockA - blockBs[n_block_idx], // blockB - rhs, // rhs - output, // output - m_base_start, // m - k_start, // k - n_start, // n - mc, // mc - actual_kc, // kc - actual_nc, // nc - num_threads, - numBlockAs, - m, - k_block_idx, - m_block_idx, - n_block_idx, // n_block_idx - m_blocks, // m_blocks - n_blocks, // n_blocks - &kernel_notifications, // kernel notifications - &lhs_notifications, // lhs notifications - need_to_pack, // need_to_pack - }; - - // We asynchronously kick off this function, which ends up - // notifying the appropriate kernel_notifications objects, - // which this thread waits on before exiting. - this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); - } - } - } - - // Make sure all the kernels are done. - for (size_t i = 0; i < kernel_notifications.size(); ++i) { - wait_until_ready(kernel_notifications[i]); - delete kernel_notifications[i]; - } - - // No need to wait for lhs notifications since they should have - // already been waited on. Just clean them up. - for (size_t i = 0; i < lhs_notifications.size(); ++i) { - delete lhs_notifications[i]; - } - - // deallocate all of the memory for both A and B's - for (size_t i = 0; i < blockAs.size(); i++) { - this->m_device.deallocate(blockAs[i]); - } - for (size_t i = 0; i < blockBs.size(); i++) { - this->m_device.deallocate(blockBs[i]); - } - -#undef CEIL_DIV - } - - /* - * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing - * the LHS block, check that all of the kernels that worked on the same - * mt_block_idx in the previous m_block are done. 
- */ - template - static void packLhs(const packLArg arg) { - // perform actual packing - LhsPacker pack_lhs; - pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); - } - - /* - * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that - * all kernels in the previous block are done. - * Then for each LHS future, we wait on the future and then call GEBP - * on the area packed by the future (which starts at - * blockA + future_idx * mt * kc) on the LHS and with the full packed - * RHS block. - * The output of this GEBP is written to output(m + i * mt, n). - */ - template - static void packRhsAndKernel(packRKArg arg) { - if (arg.need_to_pack) { - RhsPacker pack_rhs; - pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); - } - - GebpKernel gebp; - for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { - const Index m_base_start = arg.m + arg.mc*mt_block_idx; - if (m_base_start < arg.max_m) { - Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; - wait_until_ready((*arg.lhs_notifications)[blockAId]); - const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; - gebp(arg.output.getSubMapper(m_base_start, arg.n), - (*arg.blockAs)[blockAId], arg.blockB, - actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); - - // Notify that the kernel is done. - const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; - (*arg.kernel_notifications)[set_idx]->Notify(); - } - } - } -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, bool shard_by_col, bool prepacked) const { const int packed_size = std::min(PacketType::size, PacketType::size); const int output_packet_size = internal::unpacket_traits::size; const double kd = static_cast(bk); - // Peak VFMA bandwidth is 0.5. 
However if we have not enough data for - // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined - // experimentally. - double computeBandwidth = bk == 1 ? 4.0 : - (shard_by_col ? bn : bm) < Traits::nr || - (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; -#ifndef EIGEN_VECTORIZE_FMA - // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. - // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, - // so overall bandwidth is 1.0. - if (computeBandwidth == 0.5) computeBandwidth = 1.0; -#endif + double compute_bandwidth = computeBandwidth(false, bm, bn, bk); // Computations. - TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); + TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size); // Output stores. cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); if (prepacked) { @@ -1035,6 +1583,94 @@ struct TensorEvaluator l3CacheSize() / num_threads_by_k || // need more buffer space + // than L3 cache or... + k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny. + shard_by_k = false; + } else if (numext::maxi(m, n) / num_threads < + Traits::nr || // both other dimensions are tiny or... + // k per thread is not small and... + (k / num_threads_by_k > 8 * Traits::nr && + // one of the outer dimensions is tiny or sharding by k offers + // more parallelism. + (numext::mini(m, n) < 2 * Traits::nr || + num_threads_by_k > num_threads))) { + shard_by_k = true; + } + return shard_by_k; + } + + TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { + // Compute cost. + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size); + // Output stores. 
+ cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m; + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n; + // Since the inner gemm kernel is always sharded by column, the lhs + // load cost is negligible. + lhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + int numThreadsInnerDim(Index m, Index n, Index k) const { + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost = contractionCostPerInnerDim(m, n, k); + double total_parallel_cost = + TensorCostModel::totalCost(k, cost); + // Cost of reduction step accumulating the m*n per-thread buffers into the + // result. + double reduction_cost = TensorCostModel::totalCost( + m * n, TensorOpCost(2, 1, 1, true, output_packet_size)); + int num_threads = 1; + double min_cost = total_parallel_cost; + double kPerThreadOverHead = 3000; + double kFixedOverHead = 100000; + for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { + double sequential_cost = + kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); + double parallel_cost = total_parallel_cost / nt + sequential_cost; + if (parallel_cost < min_cost) { + num_threads = nt; + min_cost = parallel_cost; + } + } + return num_threads; + } + + double computeBandwidth(bool shard_by_col, Index bm, Index bn, + Index bk) const { + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = + bk == 1 ? 4.0 + : (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr + ? 2.0 + : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such + // instructions, + // so overall bandwidth is 1.0. 
+ if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + return computeBandwidth; + } + }; } // end namespace Eigen diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 860a6949..09d2da9a 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -32,6 +32,7 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; + typedef typename TypeConversion::PointerType>::type PointerType; }; template @@ -50,7 +51,10 @@ struct nested, 1, typename eval -struct PacketConverter { +struct PacketConverter; + +template +struct PacketConverter { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} @@ -108,7 +112,33 @@ struct PacketConverter { }; template -struct PacketConverter { +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + 1 * SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + SrcPacket src5 = m_impl.template packet(index + 4 * SrcPacketSize); + SrcPacket src6 = m_impl.template packet(index + 5 * SrcPacketSize); + SrcPacket src7 = m_impl.template packet(index + 6 * SrcPacketSize); + SrcPacket src8 = m_impl.template packet(index + 7 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4, src5, src6, src7, src8); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + 
+template +struct PacketConverter { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} @@ -128,6 +158,7 @@ struct PacketConverter { typedef typename internal::unpacket_traits::type TgtType; internal::scalar_cast_op converter; EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < TgtPacketSize; ++i) { values[i] = converter(m_impl.coeff(index+i)); } @@ -163,19 +194,114 @@ class TensorConversionOp : public TensorBase struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { +template struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { impl.evalSubExprsIfNeeded(NULL); return true; } }; -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { +template struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } }; +#ifdef EIGEN_USE_THREADS +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); + } +}; + +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +}; +#endif + +namespace internal { + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + return converter(impl.coeff(index)); + } +}; + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { 
+ return impl.coeff(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + static const int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + TargetPacket rslt = internal::pload(values); + return rslt; + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, SrcPacket, TargetPacket, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type TargetType; + static const int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i); + return internal::pload(values); + } +}; + +template +struct PacketConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + return impl.template packet(index); + } +}; + +} // namespace internal // Eval as rvalue template @@ -189,44 +315,98 @@ struct 
TensorEvaluator, Device> typedef typename internal::remove_all::Scalar>::type SrcType; typedef typename PacketType::type PacketReturnType; typedef typename PacketType::type PacketSourceType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + static const bool IsSameType = internal::is_same::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = true, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = + #ifndef EIGEN_USE_SYCL + true, + #else + TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast, + #endif + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + static const int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + struct TensorConversionOpBlockFactory { + template + struct XprType { + typedef TensorConversionOp type; + }; + + template + typename XprType::type expr(const ArgXprType& expr) const { + return typename XprType::type(expr); + } }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + typedef internal::TensorUnaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(Scalar* data) + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); + return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType data, EvalSubExprsCallback done) { + ConversionSubExprEvalAsync, + EvaluatorPointerType, + EvalSubExprsCallback>::run(m_impl, data, std::move(done)); } +#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - internal::scalar_cast_op converter; - return converter(m_impl.coeff(index)); + return internal::CoeffConv::run(m_impl,index); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const bool Vectorizable = TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; - return PacketConv::run(m_impl, index); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType + packet(Index index) const { + // If we are not going to do the cast, we just need to check that base + // TensorEvaluator has packet access. Otherwise we also need to make sure, + // that we have an implementation of vectorized cast. + const bool Vectorizable = + IsSameType + ? 
TensorEvaluator::PacketAccess + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); + + return internal::PacketConv::run(m_impl, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -244,33 +424,30 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } - protected: - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_impl.block(desc, scratch), + TensorConversionOpBlockFactory()); + } - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; - PacketConverter, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet(index); - } - }; + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() const { return m_impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler 
&cgh) const { + m_impl.bind(cgh); + } +#endif + protected: TensorEvaluator m_impl; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index abdf742c..b20f80ba 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -54,8 +54,8 @@ class IndexMapper { } } - array cudaInputDimensions; - array cudaOutputDimensions; + array gpuInputDimensions; + array gpuOutputDimensions; array tmp = dimensions; array ordering; const size_t offset = static_cast(Layout) == static_cast(ColMajor) @@ -65,8 +65,8 @@ class IndexMapper { const Index index = i + offset; ordering[index] = indices[i]; tmp[indices[i]] = -1; - cudaInputDimensions[index] = input_dims[indices[i]]; - cudaOutputDimensions[index] = dimensions[indices[i]]; + gpuInputDimensions[index] = input_dims[indices[i]]; + gpuOutputDimensions[index] = dimensions[indices[i]]; } int written = static_cast(Layout) == static_cast(ColMajor) @@ -75,8 +75,8 @@ class IndexMapper { for (int i = 0; i < NumDims; ++i) { if (tmp[i] >= 0) { ordering[written] = i; - cudaInputDimensions[written] = input_dims[i]; - cudaOutputDimensions[written] = dimensions[i]; + gpuInputDimensions[written] = input_dims[i]; + gpuOutputDimensions[written] = dimensions[i]; ++written; } } @@ -89,37 +89,37 @@ class IndexMapper { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { if (i > NumKernelDims) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + m_gpuInputStrides[i] = + m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; + m_gpuOutputStrides[i] = + m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; + m_gpuInputStrides[i] = 1; + 
m_gpuOutputStrides[i] = 1; } } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + if (static_cast(i + 1) < offset) { + m_gpuInputStrides[i] = + m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; + m_gpuOutputStrides[i] = + m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; } } } } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaInputStrides[d]; + const Index idx = p / m_gpuInputStrides[d]; inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; + p -= idx * m_gpuInputStrides[d]; } inputIndex += p * m_inputStrides[NumKernelDims]; } else { @@ -128,22 +128,22 @@ class IndexMapper { limit = NumDims - NumKernelDims - 1; } for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaInputStrides[d]; + const Index idx = p / m_gpuInputStrides[d]; inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; + p -= idx * m_gpuInputStrides[d]; } inputIndex += p * m_inputStrides[limit]; } return inputIndex; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { Index outputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaOutputStrides[d]; + const Index idx = p / 
m_gpuOutputStrides[d]; outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; } outputIndex += p * m_outputStrides[NumKernelDims]; } else { @@ -152,44 +152,44 @@ class IndexMapper { limit = NumDims - NumKernelDims - 1; } for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaOutputStrides[d]; + const Index idx = p / m_gpuOutputStrides[d]; outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; } outputIndex += p * m_outputStrides[limit]; } return outputIndex; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_inputStrides[offset]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_outputStrides[offset]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - NumKernelDims; return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; @@ -197,7 +197,7 @@ class IndexMapper { k * m_inputStrides[offset + 2]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; @@ -209,8 +209,8 @@ class IndexMapper { static const int NumDims = internal::array_size::value; array m_inputStrides; array m_outputStrides; - array m_cudaInputStrides; - array m_cudaOutputStrides; + array m_gpuInputStrides; + array m_gpuOutputStrides; }; @@ -231,6 +231,8 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; enum { Flags = 0 @@ -300,17 +302,25 @@ struct TensorEvaluator::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + IsAligned = int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & 
int(TensorEvaluator::PacketAccess), + BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -374,12 +384,12 @@ struct TensorEvaluator EvalTo; EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); + const bool Vectorize = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); m_kernel = local; m_local_kernel = true; @@ -544,14 +554,14 @@ struct TensorEvaluator struct GetKernelSize { @@ -568,13 +578,17 @@ struct GetKernelSize { template -__global__ void EigenConvolutionKernel1D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; +#endif const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; @@ -586,18 +600,18 @@ __global__ void EigenConvolutionKernel1D( for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = threadIdx.y * num_x_input; #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x); s[i + plane_kernel_offset] = eval.coeff(tensor_index); } __syncthreads(); // Compute the convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); #pragma unroll for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { @@ -607,7 +621,7 @@ __global__ void EigenConvolutionKernel1D( for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { result += s[k + kernel_offset] * kernel[k]; } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x); buffer[tensor_index] = result; } __syncthreads(); @@ -616,14 +630,18 @@ __global__ void EigenConvolutionKernel1D( template -__global__ void EigenConvolutionKernel2D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { +#if defined(EIGEN_HIPCC) + 
HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; +#endif const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; @@ -640,7 +658,7 @@ __global__ void EigenConvolutionKernel2D( for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = threadIdx.z * num_y_input; // Load inputs to shared memory @@ -649,7 +667,7 @@ __global__ void EigenConvolutionKernel2D( const int input_offset = num_x_input * (j + plane_kernel_offset); #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y); s[i + input_offset] = eval.coeff(tensor_index); } } @@ -657,7 +675,7 @@ __global__ void EigenConvolutionKernel2D( __syncthreads(); // Convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); #pragma unroll for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { @@ -673,7 +691,7 @@ __global__ void EigenConvolutionKernel2D( result += s[k + input_offset] * kernel[k + kernel_offset]; } } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y); buffer[tensor_index] = result; } } @@ -683,7 +701,7 @@ __global__ void EigenConvolutionKernel2D( }; template -__global__ void EigenConvolutionKernel3D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 
void EigenConvolutionKernel3D( InputEvaluator eval, const internal::IndexMapper indexMapper, @@ -691,7 +709,11 @@ __global__ void EigenConvolutionKernel3D( const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; +#endif // Load inputs to shared memory const int first_x = blockIdx.x * maxX; @@ -708,13 +730,13 @@ __global__ void EigenConvolutionKernel3D( for (int p = 0; p < numPlanes; ++p) { - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = 0; for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); } } @@ -726,7 +748,7 @@ __global__ void EigenConvolutionKernel3D( const int num_z_output = last_z - first_z + 1; const int num_y_output = last_y - first_y + 1; const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { @@ -739,7 +761,7 @@ __global__ void EigenConvolutionKernel3D( } } } - const int tensor_index = plane_output_offset + 
indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); buffer[tensor_index] = result; } } @@ -764,13 +786,19 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -852,9 +880,9 @@ struct TensorEvaluator::Dimensions InputDims; const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); + const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int 
numMultiProcessors = m_device.getNumGpuMultiProcessors(); const int warpSize = 32; switch (NumKernelDims) { @@ -889,7 +917,7 @@ struct TensorEvaluator, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); break; } case 7: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); } } break; @@ -946,7 +974,7 @@ struct TensorEvaluator(1024/(block_size.x*block_size.y), maxP); const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); + gpu_assert(shared_mem <= maxSharedMem); const int num_x_blocks = ceil(numX, maxX); const int num_y_blocks = ceil(numY, maxY); @@ -967,11 +995,11 @@ struct TensorEvaluator, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, 
maxX, numY, maxY, 4, 7, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); break; } } @@ -980,18 +1008,18 @@ struct TensorEvaluator, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); break; } } break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); break; } } @@ -1026,7 +1054,7 @@ struct TensorEvaluator indices(m_indices[idxX], m_indices[idxY], @@ -1037,7 +1065,7 @@ struct TensorEvaluator indexMapper( 
m_inputImpl.dimensions(), kernel_dims, indices); - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); break; } diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 00000000..033318fd --- /dev/null +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,544 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
+ * + * + */ + +enum class convolution_type { CONV1D, CONV2D, CONV3D }; +template +struct EigenConvolutionKernel; +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const size_t kernelSize; + const cl::sycl::range<2> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const size_t kernelSize_, const cl::sycl::range<2> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernelSize(kernelSize_), + input_range(input_range_) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { + return (boolean_check[0] && boolean_check[1]); + } + void operator()(cl::sycl::nd_item<2> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the required row to be calculated for each plane in shared memory + const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); + const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; + const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0]; + const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1)); + /// fill the shared memory + for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset; + const size_t tensor_index = + plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset); + + local_acc[local_index] = + (((i + input_offset) < (input_range[0] + kernelSize - 
1)) && itemID.get_global_id(1) < input_range[1]) + ? device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution // output start x + const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]); + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + const size_t index = plane_kernel_offset + itemID.get_local_id(0); + for (size_t k = 0; k < kernelSize; ++k) { + result += (local_acc[k + index] * kernel_ptr[k]); + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<2> kernel_size; + const cl::sycl::range<3> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + + void operator()(cl::sycl::nd_item<3> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the 
required row to be calculated for each plane in shared memory + const auto num_input = cl::sycl::range<2>{ + (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; + + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2)); + const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1]; + + const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + // fill the local memory + bool in_range_dim2 = itemID.get_global_id(2) < input_range[2]; + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset); + bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + const size_t local_index = i + local_input_offset; + const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1]); + local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && + in_range_dim1 && in_range_dim2) + ? 
device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // output offset start for each thread + const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + + for (size_t j = 0; j < kernel_size[1]; j++) { + size_t kernel_offset = kernel_size[0] * j; + const size_t index = + (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); + for (size_t i = 0; i < kernel_size[0]; i++) { + result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], + itemID.get_local_id(1) + output_offset[1]); + + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<3> kernel_size; + const cl::sycl::range<3> input_range; + const size_t numP; + + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, + const size_t numP_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_), + numP(numP_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool 
boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + void operator()(cl::sycl::nd_item<3> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; + + const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; + + const auto output_offset = + cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; + + for (size_t p = 0; p < numP; p++) { + /// fill the shared memory + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) { + size_t local_index_dim2 = num_input[0] * num_input[1] * k; + bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1)); + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1)); + size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2; + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1)); + const size_t local_index = local_index_dim1 + i; + const size_t tensor_index = + plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1], k + input_offset[2]); + local_acc[local_index] = conds ? 
device_evaluator.coeff(tensor_index) : CoeffReturnType(0); + } + } + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + for (size_t k = 0; k < kernel_size[2]; k++) { + for (size_t j = 0; j < kernel_size[1]; j++) { + for (size_t i = 0; i < kernel_size[0]; i++) { + const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); + const size_t local_index = + ((i + itemID.get_local_id(0)) + + num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); + + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +}; + +template +struct TensorEvaluator, Eigen::SyclDevice> { + typedef TensorConvolutionOp XprType; + + static const int NumDims = + internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory KernelStorage; + + enum { + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + 
CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) + : m_inputImpl(op.inputExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernelImpl(op.kernelExpression(), device), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions &kernel_dims = + m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (EvaluatorPointerType)m_device.get( + (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate_temp(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate_temp(m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + typename KernelStorage::Type in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_device.get(local), m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { + typedef TensorEvaluator InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const auto input_dim = std::array{numX, numP}; + auto global_range = cl::sycl::range<2>{}; + auto local_range = cl::sycl::range<2>{}; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + + m_device.parallel_for_setup(input_dim, global_range, local_range); + const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[0]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + typedef 
EigenConvolutionKernel + ConvKernel; + + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])); + break; + } + + case 2: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 0}; + auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numP = dimensions().TotalSize() / (numX * numY); + auto input_dim = std::array{numX, numY, numP}; + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + + const size_t local_memory_size = + (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; + const array kernel_dims{ + {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}); + break; + } + + case 3: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 
2 : 0}; + + auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]], + (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; + + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; + auto input_dim = std::array{numX, numY, numZ}; + const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); + + const array indices{ + {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], + m_kernelImpl.dimensions()[kernel_index[1]], + m_kernelImpl.dimensions()[kernel_index[2]]}}; + + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + auto local_memory_range = (local_range + kernel_size - 1); + const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; + + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
PacketReturnType packet(const Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_kernelImpl.bind(cgh); + m_inputImpl.bind(cgh); + m_buf.bind(cgh); + m_kernel.bind(cgh); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator &operator=(const TensorEvaluator &); + TensorEvaluator m_inputImpl; + KernelArgType m_kernelArg; + TensorEvaluator m_kernelImpl; + Indices m_indices; + Dimensions m_dimensions; + EvaluatorPointerType m_buf; + typename KernelStorage::Type m_kernel; + bool m_local_kernel; + const Eigen::SyclDevice EIGEN_DEVICE_REF m_device; +}; // struct TensorEvaluator + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 83c449cf..195267ce 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ 
b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -174,8 +174,11 @@ class TensorCostModel { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { double cost = totalCost(output_size, cost_per_coeff); - int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - return numext::mini(max_threads, numext::maxi(1, threads)); + double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + // Make sure we don't invoke undefined behavior when we convert to an int. + threads = numext::mini(threads, GenericNumTraits::highest()); + return numext::mini(max_threads, + numext::maxi(1, static_cast(threads))); } // taskSize assesses parallel task size. @@ -186,14 +189,13 @@ class TensorCostModel { return totalCost(output_size, cost_per_coeff) / kTaskSize; } - private: static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( double output_size, const TensorOpCost& cost_per_coeff) { // Cost of memory fetches from L2 cache. 64 is typical cache line size. // 11 is L2 cache latency on Haswell. // We don't know whether data is in L1, L2 or L3. But we are most interested // in single-threaded computational time around 100us-10ms (smaller time - // is too small for parallelization, larger time is not intersting + // is too small for parallelization, larger time is not interesting // either because we are probably using all available threads already). // And for the target time range, L2 seems to be what matters. Data set // fitting into L1 is too small to take noticeable time. 
Data set fitting diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index e020d076..95a8a84e 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -30,12 +30,13 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename traits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorCustomUnaryOp& type; + typedef const TensorCustomUnaryOpEIGEN_DEVICE_REF type; }; template @@ -86,18 +87,26 @@ struct TensorEvaluator, Devi typedef typename internal::remove_const::type Scalar; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { m_dimensions = op.func().dimensions(op.expression()); @@ -105,21 
+114,21 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (data) { evalTo(data); return false; } else { - m_result = static_cast( - m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); evalTo(m_result); return true; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { - m_device.deallocate(m_result); + EIGEN_STRONG_INLINE void cleanup() { + if (m_result) { + m_device.deallocate_temp(m_result); m_result = NULL; } } @@ -138,19 +147,25 @@ struct TensorEvaluator, Devi return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_result.bind(cgh); + } +#endif protected: - EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { - TensorMap > result( - data, m_dimensions); + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.expression(), result, m_device); } Dimensions m_dimensions; const ArgType m_op; - const Device& m_device; - CoeffReturnType* m_result; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; }; @@ -180,6 +195,8 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename conditional::val, + 
typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template @@ -242,18 +259,27 @@ struct TensorEvaluator::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); @@ -261,20 +287,21 @@ struct TensorEvaluator(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); evalTo(m_result); return true; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_result != NULL) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } } @@ -293,18 +320,25 @@ struct TensorEvaluator > result(data, m_dimensions); + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.lhsExpression(), 
m_op.rhsExpression(), result, m_device); } Dimensions m_dimensions; const XprType m_op; - const Device& m_device; - CoeffReturnType* m_result; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 29e50a3b..96fa46c8 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -28,6 +28,8 @@ template class TensorDevice { public: TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice) + template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp Assign; @@ -63,6 +65,73 @@ template class TensorDevice { ExpressionType& m_expression; }; +/** \class TensorAsyncDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its + * argument asynchronously on the specified device. Currently only + * ThreadPoolDevice implements proper asynchronous execution, while the default + * and GPU devices just run the expression synchronously and call m_done() on + * completion.. + * + * Example: + * auto done = []() { ... expression evaluation done ... 
}; + * C.device(thread_pool_device, std::move(done)) = A + B; + */ + +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, + DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorExecutor Executor; + + Assign assign(m_expression, other); + Executor::run(assign, m_device); + m_done(); + + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; + + +#ifdef EIGEN_USE_THREADS +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, + DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorAsyncExecutor Executor; + + // WARNING: After assignment 'm_done' callback will be in undefined state. + Assign assign(m_expression, other); + Executor::runAsync(assign, m_device, std::move(m_done)); + + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; +#endif + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 4f5767bc..f7792393 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -1,337 +1,6 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - -namespace Eigen { - -static const int kCudaScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// CUDA streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const cudaStream_t& stream() const = 0; - virtual const cudaDeviceProp& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. - virtual unsigned int* semaphore() const = 0; -}; - -static cudaDeviceProp* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. 
-#if __cplusplus >= 201103L - static std::atomic first(true); - if (first.exchange(false)) { -#else - static bool first = true; - if (first) { - first = false; -#endif - // We're the first thread to reach this point. - int num_devices; - cudaError_t status = cudaGetDeviceCount(&num_devices); - if (status != cudaSuccess) { - std::cerr << "Failed to get the number of CUDA devices: " - << cudaGetErrorString(status) - << std::endl; - assert(status == cudaSuccess); - } - m_deviceProperties = new cudaDeviceProp[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = cudaGetDeviceProperties(&m_deviceProperties[i], i); - if (status != cudaSuccess) { - std::cerr << "Failed to initialize CUDA device #" - << i - << ": " - << cudaGetErrorString(status) - << std::endl; - assert(status == cudaSuccess); - } - } - -#if __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_release); -#endif - m_devicePropInitialized = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!m_devicePropInitialized) { -#if __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_acquire); -#endif - sleep(1); - } - } - } -} - -static const cudaStream_t default_stream = cudaStreamDefault; - -class CudaStreamDevice : public StreamInterface { - public: - // Use the default stream on the current device - CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - cudaGetDevice(&device_); - initializeDeviceProp(); - } - // Use the default stream on the specified device - CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } - // Use the specified stream. Note that it's the - // caller responsibility to ensure that the stream can run on - // the specified device. If no device is specified the code - // assumes that the stream is associated to the current gpu device. 
- CudaStreamDevice(const cudaStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - cudaGetDevice(&device_); - } else { - int num_devices; - cudaError_t err = cudaGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(device < num_devices); - device_ = device; - } - initializeDeviceProp(); - } - - virtual ~CudaStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const cudaStream_t& stream() const { return *stream_; } - const cudaDeviceProp& deviceProperties() const { - return m_deviceProperties[device_]; - } - virtual void* allocate(size_t num_bytes) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - void* result; - err = cudaMalloc(&result, num_bytes); - assert(err == cudaSuccess); - assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(buffer != NULL); - err = cudaFree(buffer); - assert(err == cudaSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + kCudaScratchSize; - semaphore_ = reinterpret_cast(scratch); - cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - return semaphore_; - } - - private: - const cudaStream_t* stream_; - int device_; - mutable void* scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. 
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. - EIGEN_STRONG_INLINE const cudaStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default 
device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. - return firstLevelCacheSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) - cudaError_t err = cudaStreamSynchronize(stream_->stream()); - if (err != cudaSuccess) { - std::cerr << "Error detected in CUDA stream: " - << cudaGetErrorString(err) - << std::endl; - assert(err == cudaSuccess); - } -#else - assert(false && "The default device should be used instead to generate kernel code"); +#if defined(__clang__) || defined(__GNUC__) +#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file" #endif - } - - EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the CUDA runtime recorded an error for the - // underlying stream device. 
- inline bool ok() const { -#ifdef __CUDACC__ - cudaError_t error = cudaStreamQuery(stream_->stream()); - return (error == cudaSuccess) || (error == cudaErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - assert(cudaGetLastError() == cudaSuccess); - - -// FIXME: Should be device and kernel specific. -#ifdef __CUDACC__ -static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef __CUDA_ARCH__ - cudaError_t status = cudaDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - assert(status == cudaSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H +#include "TensorDeviceGpu.h" diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 9d141395..46b9d3ab 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -20,6 +20,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return allocate(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + deallocate(buffer); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { ::memcpy(dst, src, n); @@ -33,11 +39,18 @@ struct DefaultDevice { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + template + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef __CUDA_ARCH__ +#if !defined(EIGEN_GPU_COMPILE_PHASE) // Running on the host CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; #else // Running on a CUDA device return 32; @@ -45,9 +58,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#ifndef __CUDA_ARCH__ +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) // Running on the host CPU return l1CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 48*1024; // FIXME : update this number for HIP #else // Running on a CUDA device, return the amount of shared memory available. return 48*1024; @@ -55,9 +71,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#ifndef __CUDA_ARCH__ +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) // Running single threaded on the host CPU return l3CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return firstLevelCacheSize(); // FIXME : update this number for HIP #else // Running on a CUDA device return firstLevelCacheSize(); @@ -65,13 +84,17 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ +#if !defined(EIGEN_GPU_COMPILE_PHASE) // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; #else // Running on a CUDA device - return __CUDA_ARCH__ / 100; + return EIGEN_CUDA_ARCH / 100; #endif } }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h 
b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h new file mode 100644 index 00000000..ec2e3cb1 --- /dev/null +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -0,0 +1,389 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H + +// This header file container defines fo gpu* macros which will resolve to +// their equivalent hip* or cuda* versions depending on the compiler in use +// A separate header (included at the end of this file) will undefine all +#include "TensorGpuHipCudaDefines.h" + +namespace Eigen { + +static const int kGpuScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// HIP / CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const gpuStream_t& stream() const = 0; + virtual const gpuDeviceProp_t& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. 
+ virtual unsigned int* semaphore() const = 0; +}; + +class GpuDeviceProperties { + public: + GpuDeviceProperties() : + initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { + return device_properties_[device]; + } + + EIGEN_STRONG_INLINE bool isInitialized() const { + return initialized_; + } + + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); + if (status != gpuSuccess) { + std::cerr << "Failed to get the number of GPU devices: " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" + << i + << ": " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + } + + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. 
+ while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + } + } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); +} + +static const gpuStream_t default_stream = gpuStreamDefault; + +class GpuStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + gpuGetDevice(&device_); + } + // Use the default stream on the specified device + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. 
+ GpuStreamDevice(const gpuStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + gpuGetDevice(&device_); + } else { + int num_devices; + gpuError_t err = gpuGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(device < num_devices); + device_ = device; + } + } + + virtual ~GpuStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const gpuStream_t& stream() const { return *stream_; } + const gpuDeviceProp_t& deviceProperties() const { + return GetGpuDeviceProperties(device_); + } + virtual void* allocate(size_t num_bytes) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + void* result; + err = gpuMalloc(&result, num_bytes); + gpu_assert(err == gpuSuccess); + gpu_assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(buffer != NULL); + err = gpuFree(buffer); + gpu_assert(err == gpuSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); + gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + return semaphore_; + } + + private: + const gpuStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. 
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const gpuStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + stream_->deallocate(buffer); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + gpuError_t err = + gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + gpuError_t err = + 
gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip/cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuStreamSynchronize(stream_->stream()); + if (err != gpuSuccess) { + std::cerr << "Error detected in GPU stream: " + << gpuGetErrorString(err) + << std::endl; + gpu_assert(err == gpuSuccess); + } +#else + gpu_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { + return 
stream_->deviceProperties().minor; + } + + EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + + // This function checks if the GPU runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#ifdef EIGEN_GPUCC + gpuError_t error = gpuStreamQuery(stream_->stream()); + return (error == gpuSuccess) || (error == gpuErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#if defined(EIGEN_HIPCC) + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ + gpu_assert(hipGetLastError() == hipSuccess); + +#else + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + gpu_assert(cudaGetLastError() == cudaSuccess); + +#endif + +// FIXME: Should be device and kernel specific. 
+#ifdef EIGEN_GPUCC +static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t status = gpuDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + gpu_assert(status == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +// undefine all the gpu* macros we defined at the beginning of the file +#include "TensorGpuHipCudaUndefines.h" + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 7c039890..df591c21 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,109 +14,1035 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +#include namespace Eigen { -struct SyclDevice { - /// class members - /// sycl queue - mutable cl::sycl::queue m_queue; - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. 
- mutable std::map> buffer_map; - /// creating device by using selector - template SyclDevice(dev_Selector s) - : -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; + +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type( + queue.get_device() + .template get_info()), + max_work_item_sizes( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_item_sizes>()), + max_mem_alloc_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_mem_alloc_size>()), + max_compute_units(queue.get_device() + .template get_info< + cl::sycl::info::device::max_compute_units>()), + max_work_group_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_group_size>()), + local_mem_size( + queue.get_device() + .template get_info()), + platform_name(queue.get_device() + .get_platform() + .template get_info()), + device_name(queue.get_device() + .template get_info()), + device_vendor( + queue.get_device() + .template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; +}; + +} // end namespace internal +} // end namespace TensorSycl + +typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. 
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices() + -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = + platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), + platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = + (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || + device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } + } + return supported_devices; +#endif +} + +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. 
+ template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(dev_or_sel, handler), +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog(m_queue.get_context(), get_sycl_supported_devices()), +#endif + m_thread_pool(num_threads), + m_device_info(m_queue) { +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog.build_with_kernel_type(); + auto f = [&](cl::sycl::handler &cgh) { + cgh.single_task(m_prog.get_kernel(), + [=]() {}) + }; + EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); +#endif + } + + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface(dev_or_sel, + [this](cl::sycl::exception_list l) { + this->exception_caught_ = this->sycl_async_handler(l); + }, + num_threads) {} + +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } +#endif + + /// Attach an existing buffer to the pointer map, Eigen will not reuse it + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + std::lock_guard lock(pmapper_mutex_); + return static_cast(pMapper.add_pointer(buf)); + } + + /// Detach previously attached buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); + } + + /// Allocating device pointer. This pointer is actually an 8 bytes host + /// pointer used as key to access the sycl device buffer. The reason is that + /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode + /// expressions. So we create a key pointer to be used in Eigen expression + /// construction. When we convert the Eigen construction into the sycl + /// construction we use this pointer as a key in our buffer_map and we make + /// sure that we dedicate only one buffer only for this pointer. 
The device + /// pointer would be deleted by calling deallocate function. + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + if (scratch_buffers.empty()) { + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + ; + } else { + for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { + auto buff = pMapper.get_buffer(*it); + if (buff.get_size() >= num_bytes) { + auto ptr = *it; + scratch_buffers.erase(it); + return ptr; + } else { + ++it; } + } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } - })) #else - m_queue(cl::sycl::queue(s)) + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); #endif - {} - // destructor - ~SyclDevice() { deallocate_all(); } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return get_range_accessor(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return static_cast(data.get_virtual_pointer()); + } - template void deallocate(T *p) const { - auto it = buffer_map.find(p); - if (it != buffer_map.end()) { - buffer_map.erase(it); - internal::aligned_free(p); + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.insert(p); +#else 
+ TensorSycl::internal::SYCLfree(p, pMapper); +#endif + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &p) const { + deallocate_temp(p.get_virtual_pointer()); + } + + /// This is used to deallocate the device pointer. p is used as a key inside + /// the map to find the device buffer and delete it. + EIGEN_STRONG_INLINE void deallocate(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); + } + + EIGEN_STRONG_INLINE void deallocate_all() const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfreeAll(pMapper); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif + } + + /// The memcpyHostToDevice is used to copy the data from host to device + /// The destination pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + write_accessor; + if (n == 0) { + if (callback) callback(); + return; } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + write_accessor dst_acc = get_range_accessor(cgh, dst, n); + buffer_scalar_t const *ptr = static_cast(src); + auto non_deleter = [](buffer_scalar_t const *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(s_ptr, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); } - void deallocate_all() const { - std::map>::iterator it=buffer_map.begin(); - while (it!=buffer_map.end()) { - auto p=it->first; - buffer_map.erase(it); - internal::aligned_free(const_cast(p)); - it=buffer_map.begin(); + + /// The 
memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + read_accessor; + if (n == 0) { + if (callback) callback(); + return; } - buffer_map.clear(); + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + read_accessor src_acc = get_range_accessor(cgh, src, n); + buffer_scalar_t *ptr = static_cast(dst); + auto non_deleter = [](buffer_scalar_t *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(src_acc, s_ptr); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); } - /// creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - ///the function then adds an entry by creating a sycl buffer for that particular pointer. - template inline cl::sycl::accessor - get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { - return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); + /// The memcpy function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. 
+ EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto src_acc = get_range_accessor(cgh, src, n); + auto dst_acc = get_range_accessor(cgh, dst, n); + cgh.copy(src_acc, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); } - template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { - using Type = cl::sycl::buffer; - std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), - [](void *dataMem) { delete static_cast(dataMem); }))); - (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); - return ret; + /// the memset function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto dst_acc = get_range_accessor(cgh, data, n); + // The cast to uint8_t is here to match the behaviour of the standard + // memset. The cast to buffer_scalar_t is needed to match the type of the + // accessor (in case buffer_scalar_t is not uint8_t) + cgh.fill(dst_acc, static_cast(static_cast(c))); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); } - template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { - return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); + /// Get a range accessor to the virtual pointer's device memory. 
This range + /// accessor will allow access to the memory from the pointer to the end of + /// the buffer. + /// + /// NOTE: Inside a kernel the range accessor will always be indexed from the + /// start of the buffer, so the offset in the accessor is only used by + /// methods like handler::copy and will not be available inside a kernel. + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef TensorSycl::internal::RangeAccess ret_type; + typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; + + std::lock_guard lock(pmapper_mutex_); + + auto original_buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + const ptrdiff_t typed_offset = offset / sizeof(T); + eigen_assert(typed_offset >= 0); + const auto typed_size = original_buffer.get_size() / sizeof(T); + auto buffer = original_buffer.template reinterpret< + typename Eigen::internal::remove_const::type>( + cl::sycl::range<1>(typed_size)); + const ptrdiff_t size = buffer.get_count() - typed_offset; + eigen_assert(size >= 0); + typedef cl::sycl::accessor::type, + 1, AcMd, global_access, is_place_holder> + placeholder_accessor_t; + const auto start_ptr = static_cast(ptr) - offset; + return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), + cl::sycl::id<1>(typed_offset)), + static_cast(typed_offset), + reinterpret_cast(start_ptr)); } - /// allocating memory on the cpu - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { - return internal::aligned_malloc(8); + /// Get a range accessor to the virtual pointer's device memory with a + /// specified size. 
+ template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_range_accessor(cl::sycl::handler &cgh, const void *ptr, + const Index n_bytes) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + eigen_assert(n_bytes >= 0); + std::lock_guard lock(pmapper_mutex_); + auto buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + eigen_assert(offset >= 0); + eigen_assert(offset + n_bytes <= buffer.get_size()); + return buffer.template get_access( + cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); } - // some runtime conditions that can be applied here - bool isDeviceSuitable() const { return true; } + /// Creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. If found it gets the accessor from it, if + /// not, the function then adds an entry by creating a sycl buffer for that + /// particular pointer. + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr) + .template get_access( + cgh); + } + + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr); + } + + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_offset(ptr); + } + + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs, + const Rhs &rhs, OutPtr outptr, + Range thread_range, + Index scratchSize, + T... 
var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + lhs.bind(cgh); + rhs.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr, + OutPtr &outptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr, + Range thread_range, + Index scratchSize, + T... 
var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + + EIGEN_STRONG_INLINE void synchronize() const { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + + + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { + set_latest_event(e); +#ifndef EIGEN_SYCL_ASYNC_EXECUTION + synchronize(); +#endif + } + + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + tileSize = static_cast(getNearestPowerOfTwoWorkGroupSize()); + tileSize = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(tileSize)); + rng = n; + if (rng == 0) rng = static_cast(1); + GRange = rng; + if (tileSize > GRange) + tileSize = GRange; + else if (GRange > tileSize) { + Index xMode = static_cast(GRange % tileSize); + if (xMode != 0) GRange += static_cast(tileSize - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + std::array input_range = input_dim; + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); + Index pow_of_2 = 
static_cast(std::log2(max_workgroup_Size)); + local_range[1] = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); + } + local_range[0] = static_cast(max_workgroup_Size / local_range[1]); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + std::array input_range = input_dim; + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + local_range[2] = + static_cast(std::pow(2, static_cast(pow_of_2 / 3))); + input_range[2] = input_dim[2]; + if (input_range[2] == 0) input_range[1] = static_cast(1); + global_range[2] = input_range[2]; + if (local_range[2] > global_range[2]) + local_range[2] = global_range[2]; + else if (global_range[2] > local_range[2]) { + Index xMode = static_cast(global_range[2] % local_range[2]); + if (xMode != 0) 
+ global_range[2] += static_cast(local_range[2] - xMode); + } + pow_of_2 = static_cast( + std::log2(static_cast(max_workgroup_Size / local_range[2]))); + local_range[1] = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); + } + local_range[0] = static_cast(max_workgroup_Size / + (local_range[1] * local_range[2])); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); + } + } + + EIGEN_STRONG_INLINE bool has_local_memory() const { +#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + return false; +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + return true; +#else + return m_device_info.local_mem_type == + cl::sycl::info::local_mem_type::local; +#endif + } + + EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { + return m_device_info.max_mem_alloc_size; + } + + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return m_device_info.max_compute_units; + } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return m_device_info.max_work_group_size; + } + + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return m_device_info.max_work_item_sizes; + } + + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int 
majorDeviceVersion() const { return 1; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return 2; + } + + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return m_device_info.local_mem_size; + } + + // This function returns the nearest power of 2 Work-group size which is <= + // maximum device workgroup size. + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return getPowerOfTwo(m_device_info.max_work_group_size, false); + } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - ::memcpy(dst, src, n); + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return m_device_info.platform_name; } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { - auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); - memcpy(host_acc.get_pointer(), src, n); + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return m_device_info.device_name; } - /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { - auto it = buffer_map.find(src); - if (it != buffer_map.end()) { - auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); - memcpy(dst,host_acc.get_pointer(), n); - } else{ - eigen_assert("no device memory found. 
The memory might be destroyed before creation"); + + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return m_device_info.device_vendor; + } + + // This function returns the nearest power of 2 + // if roundup is true returns result>=wgsize + // else it return result <= wgsize + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { + if (roundUp) --wGSize; + wGSize |= (wGSize >> 1); + wGSize |= (wGSize >> 2); + wGSize |= (wGSize >> 4); + wGSize |= (wGSize >> 8); + wGSize |= (wGSize >> 16); +#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 + wGSize |= (wGSize >> 32); +#endif + return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize); + } + + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } + + // This function checks if the runtime recorded an error for the + // underlying stream device. + EIGEN_STRONG_INLINE bool ok() const { + if (!exception_caught_) { + synchronize(); } + return !exception_caught_; + } + + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + return latest_events_[std::this_thread::get_id()]; +#else + eigen_assert(false); + return cl::sycl::event(); +#endif + } + + // destructor + ~QueueInterface() { + pMapper.clear(); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { - ::memset(buffer, c, n); + protected: + EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + latest_events_[std::this_thread::get_id()] = e; +#else + EIGEN_UNUSED_VARIABLE(e); +#endif + } + + void synchronize_and_callback(cl::sycl::event e, + const std::function &callback) const { + set_latest_event(e); + if (callback) { + auto callback_ = [=]() { +#ifdef EIGEN_EXCEPTIONS + cl::sycl::event(e).wait_and_throw(); +#else + 
cl::sycl::event(e).wait(); +#endif + callback(); + }; + m_thread_pool.Schedule(std::move(callback_)); + } else { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return 1; + + bool sycl_async_handler(cl::sycl::exception_list exceptions) const { + bool exception_caught = false; + for (const auto &e : exceptions) { + if (e) { + exception_caught = true; + EIGEN_THROW_X(e); + } + } + return exception_caught; } + + /// class members: + bool exception_caught_ = false; + + mutable std::mutex pmapper_mutex_; + +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + mutable std::mutex event_mutex_; + mutable std::unordered_map latest_events_; +#endif + + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of + /// SyclDevice. If a non-read-only pointer is needed to be accessed on the + /// host we should manually deallocate it. + mutable TensorSycl::internal::PointerMapper pMapper; +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + mutable std::unordered_set scratch_buffers; +#endif + /// sycl queue + mutable cl::sycl::queue m_queue; +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + mutable cl::sycl::program m_prog; +#endif + + /// The thread pool is used to wait on events and call callbacks + /// asynchronously + mutable Eigen::ThreadPool m_thread_pool; + + const TensorSycl::internal::SyclDeviceInfo m_device_info; }; +struct SyclDeviceBase { + /// QueueInterface is not owned. 
it is the caller's responsibility to destroy + /// it + const QueueInterface *m_queue_stream; + explicit SyclDeviceBase(const QueueInterface *queue_stream) + : m_queue_stream(queue_stream) {} + EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { + return m_queue_stream; + } +}; + +// Here is a sycl device struct which accept the sycl queue interface +// as an input +struct SyclDevice : public SyclDeviceBase { + explicit SyclDevice(const QueueInterface *queue_stream) + : SyclDeviceBase(queue_stream) {} + + // this is the accessor used to construct the evaluator + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + return queue_stream()->template get_range_accessor(ptr); + } + + // get sycl accessor + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + return queue_stream()->template get_sycl_accessor(cgh, ptr); + } + + /// Accessing the created sycl device buffer for the device pointer + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + return queue_stream()->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); + } + + /// This is used to prepare the number of threads and also the number of + /// 
threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); + } + + /// allocate device memory + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { + return queue_stream()->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return queue_stream()->allocate_temp(num_bytes); + } + + /// deallocate device memory + EIGEN_STRONG_INLINE void deallocate(void *p) const { + queue_stream()->deallocate(p); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { + queue_stream()->deallocate_temp(buffer); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &buffer) const { + queue_stream()->deallocate_temp(buffer); + } + EIGEN_STRONG_INLINE void deallocate_all() const { + queue_stream()->deallocate_all(); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return queue_stream()->get(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return queue_stream()->get(data); + } + + /// attach existing buffer + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + return queue_stream()->attach_buffer(buf); + } + /// detach buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + queue_stream()->detach_buffer(p); + } + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + return queue_stream()->get_offset(ptr); + } + + // some runtime conditions that can be applied here + EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + + /// memcpyHostToDevice + template + EIGEN_STRONG_INLINE void memcpyHostToDevice( + 
Index *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyHostToDevice(dst, src, n, callback); + } + /// memcpyDeviceToHost + template + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyDeviceToHost(dst, src, n, callback); + } + /// the memcpy function + template + EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + queue_stream()->memcpy(dst, src, n); + } + /// the memset function + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + queue_stream()->memset(data, c, n); + } + /// returning the sycl queue + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { + return queue_stream()->sycl_queue(); + } +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { + return queue_stream()->program(); + } +#endif + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on sycl devices. 
+ return firstLevelCacheSize(); + } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return queue_stream()->getNumSyclMultiProcessors(); + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return queue_stream()->maxSyclThreadsPerBlock(); + } + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return queue_stream()->maxWorkItemSizes(); + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return queue_stream()->maxSyclThreadsPerMultiProcessor(); + } + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return queue_stream()->sharedMemPerBlock(); + } + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + } + + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { + return queue_stream()->getPowerOfTwo(val, roundUp); + } + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return queue_stream()->majorDeviceVersion(); + } + + EIGEN_STRONG_INLINE void synchronize() const { + queue_stream()->synchronize(); + } + EIGEN_STRONG_INLINE void async_synchronize( + cl::sycl::event e = cl::sycl::event()) const { + queue_stream()->async_synchronize(e); + } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { + return queue_stream()->get_latest_event(); + } + + // This function checks if the runtime recorded an error for the + // underlying stream device. 
+ EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } + + EIGEN_STRONG_INLINE bool has_local_memory() const { + return queue_stream()->has_local_memory(); + } + EIGEN_STRONG_INLINE long max_buffer_size() const { + return queue_stream()->max_buffer_size(); + } + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return queue_stream()->getPlatformName(); + } + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return queue_stream()->getDeviceName(); + } + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return queue_stream()->getDeviceVendor(); + } + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const { + queue_stream()->template binary_kernel_launcher( + var...); + } + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const { + queue_stream()->template unary_kernel_launcher( + var...); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const { + queue_stream()->template nullary_kernel_launcher( + var...); + } +}; } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index a5e084a2..e524b535 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -12,67 +12,6 @@ namespace Eigen { -// Use the SimpleThreadPool by default. We'll switch to the new non blocking -// thread pool later. -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL -template using ThreadPoolTempl = NonBlockingThreadPoolTempl; -typedef NonBlockingThreadPool ThreadPool; -#else -template using ThreadPoolTempl = SimpleThreadPoolTempl; -typedef SimpleThreadPool ThreadPool; -#endif - - -// Barrier is an object that allows one or more threads to wait until -// Notify has been called a specified number of times. 
-class Barrier { - public: - Barrier(unsigned int count) : state_(count << 1), notified_(false) { - eigen_assert(((count << 1) >> 1) == count); - } - ~Barrier() { - eigen_plain_assert((state_>>1) == 0); - } - - void Notify() { - unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; - if (v != 1) { - eigen_assert(((v + 2) & ~1) != 0); - return; // either count has not dropped to 0, or waiter is not waiting - } - std::unique_lock l(mu_); - eigen_assert(!notified_); - notified_ = true; - cv_.notify_all(); - } - - void Wait() { - unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); - if ((v >> 1) == 0) return; - std::unique_lock l(mu_); - while (!notified_) { - cv_.wait(l); - } - } - - private: - std::mutex mu_; - std::condition_variable cv_; - std::atomic state_; // low bit is waiter flag - bool notified_; -}; - - -// Notification is an object that allows a user to to wait for another -// thread to signal a notification that an event has occurred. -// -// Multiple threads can wait on the same Notification object, -// but only one caller must call Notify() on the object. -struct Notification : Barrier { - Notification() : Barrier(1) {}; -}; - - // Runs an arbitrary function and then calls Notify() on the passed in // Notification. template struct FunctionWrapperWithNotification @@ -102,22 +41,75 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { } } +// An abstract interface to a device specific memory allocator. +class Allocator { + public: + virtual ~Allocator() {} + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; +}; // Build a thread pool device on top the an existing pool of threads. struct ThreadPoolDevice { // The ownership of the thread pool remains with the caller. 
- ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr) + : pool_(pool), num_threads_(num_cores), allocator_(allocator) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); + return allocator_ ? allocator_->allocate(num_bytes) + : internal::aligned_malloc(num_bytes); } EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); + if (allocator_) { + allocator_->deallocate(buffer); + } else { + internal::aligned_free(buffer); + } + } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + deallocate(buffer); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifdef __ANDROID__ ::memcpy(dst, src, n); +#else + // TODO(rmlarsen): Align blocks on cache lines. + // We have observed that going beyond 4 threads usually just wastes + // CPU cycles due to the threads competing for memory bandwidth, so we + // statically schedule at most 4 block copies here. + const size_t kMinBlockSize = 32768; + const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); + if (n <= kMinBlockSize || num_threads < 2) { + ::memcpy(dst, src, n); + } else { + const char* src_ptr = static_cast(src); + char* dst_ptr = static_cast(dst); + const size_t blocksize = (n + (num_threads - 1)) / num_threads; + Barrier barrier(static_cast(num_threads - 1)); + // Launch the last 3 blocks on worker threads. 
+ for (size_t i = 1; i < num_threads; ++i) { + enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] { + ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, + numext::mini(blocksize, n - (i * blocksize))); + }); + } + // Launch the first block on the main thread. + ::memcpy(dst_ptr, src_ptr, blocksize); + barrier.Wait(); + } +#endif } EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); @@ -134,6 +126,12 @@ struct ThreadPoolDevice { return num_threads_; } + // Number of theads available in the underlying thread pool. This number can + // be different from the value returned by numThreads(). + EIGEN_STRONG_INLINE int numThreadsInPool() const { + return pool_->NumThreads(); + } + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); } @@ -149,23 +147,31 @@ struct ThreadPoolDevice { } template - EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, + Args&&... args) const { Notification* n = new Notification(); - pool_->Schedule(std::bind(&FunctionWrapperWithNotification::run, n, f, args...)); + pool_->Schedule( + std::bind(&FunctionWrapperWithNotification::run, n, + std::move(f), args...)); return n; } template - EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, - Function&& f, + EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const { - pool_->Schedule(std::bind( - &FunctionWrapperWithBarrier::run, b, f, args...)); + pool_->Schedule( + std::bind(&FunctionWrapperWithBarrier::run, b, + std::move(f), args...)); } template - EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { - pool_->Schedule(std::bind(f, args...)); + EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, + Args&&... 
args) const { + if (sizeof...(args) > 0) { + pool_->Schedule(std::bind(std::move(f), args...)); + } else { + pool_->Schedule(std::move(f)); + } } // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if @@ -174,44 +180,189 @@ struct ThreadPoolDevice { return pool_->CurrentThreadId(); } - // parallelFor executes f with [0, n) arguments in parallel and waits for - // completion. F accepts a half-open interval [first, last). - // Block size is choosen based on the iteration cost and resulting parallel + // WARNING: This function is synchronous and will block the calling thread. + // + // Synchronous parallelFor executes f with [0, n) arguments in parallel and + // waits for completion. F accepts a half-open interval [first, last). Block + // size is chosen based on the iteration cost and resulting parallel // efficiency. If block_align is not nullptr, it is called to round up the // block size. void parallelFor(Index n, const TensorOpCost& cost, std::function block_align, std::function f) const { - typedef TensorCostModel CostModel; + if (EIGEN_PREDICT_FALSE(n <= 0)){ + return; + // Compute small problems directly in the caller thread. + } else if (n == 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + return; + } + + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block.count)); + std::function handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, + Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. 
+ const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); + lastIdx = midIdx; + } + // Single block or less, execute directly. + f(firstIdx, lastIdx); + barrier.Notify(); + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + handleRange(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); + } + + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, + std::function f) const { + parallelFor(n, cost, nullptr, std::move(f)); + } + + // WARNING: This function is asynchronous and will not block the calling thread. + // + // Asynchronous parallelFor executes f with [0, n) arguments in parallel + // without waiting for completion. When the last block finished, it will call + // 'done' callback. F accepts a half-open interval [first, last). Block size + // is chosen based on the iteration cost and resulting parallel efficiency. If + // block_align is not nullptr, it is called to round up the block size. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function block_align, + std::function f, + std::function done) const { + // Compute small problems directly in the caller thread. if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { f(0, n); + done(); return; } - // Calculate block size based on (1) the iteration cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate - // parallelization overheads; not too large to mitigate tail - // effect and potential load imbalance and we also want number - // of blocks to be evenly dividable across threads. 
+ // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + ParallelForAsyncContext* const ctx = + new ParallelForAsyncContext(block.count, std::move(f), std::move(done)); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule( + [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); + lastIdx = midIdx; + } + + // Single block or less, execute directly. + ctx->f(firstIdx, lastIdx); + + // Delete async context if it was the last block. + if (ctx->count.fetch_sub(1) == 1) delete ctx; + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + ctx->handle_range(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + } + } + + // Convenience wrapper for parallelForAsync that does not align blocks. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function f, + std::function done) const { + parallelForAsync(n, cost, nullptr, std::move(f), std::move(done)); + } + + // Thread pool accessor. + ThreadPoolInterface* getPool() const { return pool_; } + + // Allocator accessor. 
+ Allocator* allocator() const { return allocator_; } - double block_size_f = 1.0 / CostModel::taskSize(1, cost); + private: + typedef TensorCostModel CostModel; + + // For parallelForAsync we must keep passed in closures on the heap, and + // delete them only after `done` callback finished. + struct ParallelForAsyncContext { + ParallelForAsyncContext(Index block_count, + std::function block_f, + std::function done_callback) + : count(block_count), + f(std::move(block_f)), + done(std::move(done_callback)) {} + ~ParallelForAsyncContext() { done(); } + + std::atomic count; + std::function f; + std::function done; + + std::function handle_range; + }; + + struct ParallelForBlock { + Index size; // block size + Index count; // number of blocks + }; + + // Calculates block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate parallelization + // overheads; not too large to mitigate tail effect and potential load + // imbalance and we also want number of blocks to be evenly dividable across + // threads. 
+ ParallelForBlock CalculateParallelForBlock( + const Index n, const TensorOpCost& cost, + std::function block_align) const { + const double block_size_f = 1.0 / CostModel::taskSize(1, cost); const Index max_oversharding_factor = 4; Index block_size = numext::mini( - n, numext::maxi(divup(n, max_oversharding_factor * numThreads()), - block_size_f)); + n, numext::maxi( + divup(n, max_oversharding_factor * numThreads()), + block_size_f)); const Index max_block_size = numext::mini(n, 2 * block_size); + if (block_align) { Index new_block_size = block_align(block_size); eigen_assert(new_block_size >= block_size); block_size = numext::mini(n, new_block_size); } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = static_cast(block_count) / (divup(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. for (Index prev_block_count = block_count; @@ -244,36 +395,12 @@ struct ThreadPoolDevice { } } - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - Barrier barrier(static_cast(block_count)); - std::function handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { - if (last - first <= block_size) { - // Single block or less, execute directly. - f(first, last); - barrier.Notify(); - return; - } - // Split into halves and submit to the pool. - Index mid = first + divup((last - first) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); - }; - handleRange(0, n); - barrier.Wait(); - } - - // Convenience wrapper for parallelFor that does not align blocks. 
- void parallelFor(Index n, const TensorOpCost& cost, - std::function f) const { - parallelFor(n, cost, nullptr, std::move(f)); + return {block_size, block_count}; } - private: ThreadPoolInterface* pool_; int num_threads_; + Allocator* allocator_; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 451940de..f0f1e832 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -32,16 +32,16 @@ namespace Eigen { // Boilerplate code namespace internal { -template struct dget { - static const std::size_t value = get::value; +template struct dget { + static const std::ptrdiff_t value = get::value; }; -template +template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static inline Index run(array const& indices, + static EIGEN_STRONG_INLINE Index run(array const& indices, const Dimensions& dimensions) { return array_get(indices) + @@ -50,21 +50,21 @@ struct fixed_size_tensor_index_linearization_helper } }; -template +template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static inline Index run(array const&, const Dimensions&) + static EIGEN_STRONG_INLINE Index run(array const&, const Dimensions&) { return 0; } }; -template +template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static inline Index run(const Index index, + static EIGEN_STRONG_INLINE Index run(const Index index, const Dimensions& dimensions) { const Index mult = (index == n-1) ? 
1 : 0; @@ -77,7 +77,7 @@ template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static inline Index run(const Index, + static EIGEN_STRONG_INLINE Index run(const Index, const Dimensions&) { return 0; @@ -90,9 +90,11 @@ struct fixed_size_tensor_index_extraction_helper // Fixed size #ifndef EIGEN_EMULATE_CXX11_META_H template -struct Sizes : internal::numeric_list { +struct Sizes { typedef internal::numeric_list Base; + const Base t = Base(); static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); + static const ptrdiff_t count = Base::count; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; @@ -119,17 +121,17 @@ struct Sizes : internal::numeric_list { return *this; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { - return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, t); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + ptrdiff_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + ptrdiff_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); } }; @@ -142,25 +144,25 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes +template struct non_zero_size { - typedef internal::type2val type; + typedef 
internal::type2val type; }; template <> struct non_zero_size<0> { typedef internal::null_type type; }; -template struct Sizes { +template struct Sizes { typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; - static const size_t count = Base::count; - static const std::size_t total_size = internal::arg_prod::value; + static const std::ptrdiff_t count = Base::count; + static const std::ptrdiff_t total_size = internal::arg_prod::value; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const { return count; } - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() { return internal::arg_prod::value; } @@ -176,7 +178,7 @@ template Sizes(DenseIndex... /*indices*/) { } - explicit Sizes(std::initializer_list) { + explicit Sizes(std::initializer_list) { // todo: add assertion } #else @@ -211,18 +213,18 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfColMajor(const array& indices) const { + ptrdiff_t IndexOfColMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfRowMajor(const array& indices) const { + ptrdiff_t IndexOfRowMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); } }; namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { return Sizes::total_size; } } @@ -231,7 +233,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes +template struct tensor_index_linearization_helper { 
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -243,7 +245,7 @@ struct tensor_index_linearization_helper } }; -template +template struct tensor_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -262,7 +264,7 @@ struct DSizes : array { typedef array Base; static const int count = NumDims; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumDims; } @@ -282,6 +284,57 @@ struct DSizes : array { (*this)[0] = i0; } + EIGEN_DEVICE_FUNC DSizes(const DimensionList& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + // Enable DSizes index type promotion only if we are promoting to the + // larger type, e.g. allow to promote dimensions of type int to long. + template + EIGEN_DEVICE_FUNC + explicit DSizes(const array& other, + // Default template parameters require c++11. + typename internal::enable_if< + internal::is_same< + DenseIndex, + typename internal::promote_index_type< + DenseIndex, + OtherIndex + >::type + >::value, void*>::type = 0) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = static_cast(other[i]); + } + } + +#ifdef EIGEN_HAS_INDEX_LIST + template + EIGEN_DEVICE_FUNC + explicit DSizes(const Eigen::IndexList& dimensions) { + for (int i = 0; i < dimensions.count; ++i) { + (*this)[i] = dimensions[i]; + } + } +#endif + +#ifndef EIGEN_EMULATE_CXX11_META_H + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#else + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#endif + #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... 
otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { @@ -330,12 +383,21 @@ struct DSizes : array { } }; - - +template +std::ostream& operator<<(std::ostream& os, + const DSizes& dims) { + os << "["; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} // Boilerplate namespace internal { -template +template struct tensor_vsize_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -347,7 +409,7 @@ struct tensor_vsize_index_linearization_helper } }; -template +template struct tensor_vsize_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -362,10 +424,10 @@ struct tensor_vsize_index_linearization_helper namespace internal { template struct array_size > { - static const size_t value = NumDims; + static const ptrdiff_t value = NumDims; }; template struct array_size > { - static const size_t value = NumDims; + static const ptrdiff_t value = NumDims; }; #ifndef EIGEN_EMULATE_CXX11_META_H template struct array_size > { @@ -375,42 +437,42 @@ template struct array_size::count; }; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { - return get >::value; + return get >::value; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { eigen_assert(false && "should never be called"); return -1; } #else -template struct array_size > { - static const size_t value = Sizes::count; +template struct array_size > { + static const ptrdiff_t value = Sizes::count; }; -template struct array_size > { - static const size_t value = Sizes::count; +template struct array_size > { + static const ptrdiff_t value = Sizes::count; }; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { return get::Base>::value; } #endif -template +template struct sizes_match_below_dim { - 
static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return false; } }; -template +template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { - return (array_get(dims1) == array_get(dims2)) & + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { + return (array_get(dims1) == array_get(dims2)) && sizes_match_below_dim::run(dims1, dims2); } }; template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return true; } }; @@ -419,7 +481,7 @@ struct sizes_match_below_dim { template -EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 06987132..a48d035f 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -32,6 +32,7 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename MakePointer_::Type PointerType; enum { Flags = 0 @@ -41,6 +42,8 @@ struct traits > // Intermediate typedef to workaround MSVC issue. 
typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; + + }; }; @@ -73,6 +76,8 @@ class TensorEvalToOp : public TensorBase, typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; + static const int NumDims = Eigen::internal::traits::NumDimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) : m_xpr(expr), m_buffer(buffer) {} @@ -98,38 +103,60 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - + static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), - m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) - { } + static const int NumDims = internal::traits::NumDimensions; - // Used for accessor extraction in SYCL Managed TensorMap: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { - return m_op; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef 
internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef internal::TensorBlockAssignment< + CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} + + + EIGEN_STRONG_INLINE ~TensorEvaluator() { } - typedef typename internal::traits >::template MakePointer::Type DevicePointer; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); } +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType scalar, EvalSubExprsCallback done) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done)); + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { m_buffer[i] = m_impl.coeff(i); } @@ -137,7 +164,34 @@ struct TensorEvaluator, Device> internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? 
Aligned : Unaligned>(i)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + // Add `m_buffer` as destination buffer to the block descriptor. + desc.template AddDestinationBuffer( + /*dst_base=*/m_buffer + desc.offset(), + /*dst_strides=*/internal::strides(m_impl.dimensions())); + + ArgTensorBlock block = + m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); + + // If block was evaluated into a destination buffer, there is no need to do + // an assignment. + if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + desc.dimensions(), internal::strides(m_impl.dimensions()), + m_buffer, desc.offset()), + block.expr()); + } + block.cleanup(); + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -159,19 +213,20 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } ArgType expression() const { return m_expression; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_buffer.bind(cgh); + } + #endif - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} private: TensorEvaluator m_impl; - const Device& m_device; - DevicePointer m_buffer; - const XprType& m_op; + 
EvaluatorPointerType m_buffer; const ArgType m_expression; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 834ce07d..3aff7fa0 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -32,44 +32,72 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; + typedef Derived XprType; + static const int PacketSize = PacketType::size; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + 
//===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get((const_cast(m.data())))), + m_dims(m.dimensions()), + m_device(device) { } - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { - if (dest) { - m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { + if (!NumTraits::type>::RequireInitialization && dest) { + m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - eigen_assert(m_data); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { + eigen_assert(m_data != NULL); return m_data[index]; } @@ -79,6 +107,18 @@ struct TensorEvaluator return internal::ploadt(m_data + index); } + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. 
for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const + { + return internal::ploadu(m_data + index, umask); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { @@ -86,7 +126,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -94,8 +134,9 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { - eigen_assert(m_data); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& + coeffRef(const array& coords) { + eigen_assert(m_data != NULL); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -105,19 +146,50 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_data != NULL); + return 
TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(m_data != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment + TensorBlockAssign; - /// required by sycl in order to construct sycl buffer from raw pointer - const Device& device() const{return m_device;} + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides(m_dims), m_data, + desc.offset()), + block.expr()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits::template MakePointer::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device EIGEN_DEVICE_REF m_device; }; namespace { @@ -126,7 +198,7 @@ T loadConstant(const T* address) { return *address; } // Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { return __ldg(address); @@ -140,6 +212,13 @@ Eigen::half loadConstant(const Eigen::half* address) { return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); } #endif +#ifdef EIGEN_USE_SYCL +// overload of load constant should be implemented here based on range access +template +T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess &address) { + return *address; +} +#endif } @@ -152,40 +231,64 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type 
PacketReturnType; typedef typename Derived::Dimensions Dimensions; + typedef const Derived XprType; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + typedef typename internal::remove_const::type ScalarNoConst; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; + static const int PacketSize = PacketType::size; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType 
data) { if (!NumTraits::type>::RequireInitialization && data) { - m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); return loadConstant(m_data+index); } @@ -195,8 +298,20 @@ struct TensorEvaluator return internal::ploadt_ro(m_data + index); } + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const + { + return internal::ploadu(m_data + index, umask); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); return loadConstant(m_data+index); @@ -204,19 +319,32 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); } - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits::template MakePointer::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device EIGEN_DEVICE_REF m_device; }; @@ -229,15 +357,6 @@ struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; - enum { - IsAligned = true, - PacketAccess = internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() { } @@ -246,13 +365,42 @@ struct 
TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess + #ifdef EIGEN_USE_SYCL + && (PacketType::size >1) + #endif + , + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -268,16 +416,17 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const 
TensorEvaluator& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor - NullaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_argImpl.bind(cgh); + } +#endif private: const NullaryOp m_functor; @@ -295,32 +444,60 @@ struct TensorEvaluator, Device> typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNoConst; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static const int NumDims = internal::array_size::value; + + //===- 
Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef internal::TensorCwiseUnaryBlock + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -341,15 +518,31 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.getResourceRequirements().addCostPerCoeff( + {0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); + } - /// required by sycl in order to extract the accessor - const TensorEvaluator & impl() const { return m_argImpl; } - /// added for sycl in order to construct the buffer from sycl device - UnaryOp functor() const { return 
m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{ + m_argImpl.bind(cgh); + } +#endif private: + const Device EIGEN_DEVICE_REF m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -363,16 +556,23 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -384,8 +584,27 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int NumDims = 
internal::array_size< + typename TensorEvaluator::Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + LeftTensorBlock; + typedef typename TensorEvaluator::TensorBlock + RightTensorBlock; + + typedef internal::TensorCwiseBinaryBlock + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -393,12 +612,25 @@ struct TensorEvaluator + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, + [done](bool) { done(true); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); } @@ -421,15 +653,34 @@ struct TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - BinaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return internal::TensorBlockResourceRequirements::merge( + m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()) + .addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + desc.DropDestinationBuffer(); + 
return TensorBlock(m_leftImpl.block(desc, scratch), + m_rightImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif private: + const Device EIGEN_DEVICE_REF m_device; const BinaryOp m_functor; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; @@ -444,14 +695,20 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && + internal::functor_traits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_arg1Impl(op.arg1Expression(), device), m_arg2Impl(op.arg2Expression(), device), @@ -479,8 +736,14 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + //===- Tensor block 
evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -488,13 +751,13 @@ struct TensorEvaluator & arg1Impl() const { return m_arg1Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_arg1Impl.bind(cgh); + m_arg2Impl.bind(cgh); + m_arg3Impl.bind(cgh); + } +#endif private: const TernaryOp m_functor; @@ -547,15 +812,23 @@ struct TensorEvaluator typedef typename XprType::Scalar Scalar; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::packet_traits::HasBlend, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + PacketType::HasBlend, + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_condImpl(op.ifExpression(), device), 
m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) @@ -569,8 +842,42 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + IfArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock + ThenArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock + ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template + struct XprType { + typedef TensorSelectOp type; + }; + + template + typename XprType::type expr( + const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const { + return typename XprType::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -578,13 +885,26 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + 
template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); + }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); m_thenImpl.cleanup(); m_elseImpl.cleanup(); @@ -597,13 +917,15 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - internal::Selector select; - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet(index), - m_elseImpl.template packet(index)); + internal::Selector select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -613,14 +935,42 @@ struct TensorEvaluator .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & cond_impl() const { return m_condImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& then_impl() const { return m_thenImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& else_impl() const { return m_elseImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + auto then_req = m_thenImpl.getResourceRequirements(); + auto else_req = m_elseImpl.getResourceRequirements(); + + auto merged_req = + internal::TensorBlockResourceRequirements::merge(then_req, 
else_req); + merged_req.cost_per_coeff = + then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); + + return internal::TensorBlockResourceRequirements::merge( + m_condImpl.getResourceRequirements(), merged_req); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. + desc.DropDestinationBuffer(); + + return TensorBlock( + m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif private: TensorEvaluator m_condImpl; TensorEvaluator m_thenImpl; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f01d77c0..c52fb77d 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -12,31 +12,94 @@ namespace Eigen { -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... 
registers and + * instructions) + * @tparam Tiling can use block based tensor evaluation + * (see TensorBlock.h) + */ namespace internal { -// Default strategy: the expression is evaluated with a single cpu thread. -template -class TensorExecutor -{ +/** + * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely + * expensive. If expression has at least one broadcast op in it, and it supports + * block based evaluation, we always prefer it, even for the small tensors. For + * all other tileable ops, block evaluation overhead for small tensors (fits + * into L1) is too large, and we fallback on vectorized evaluation. + */ + +// TODO(ezhulenev): Add specializations for all other types of Tensor ops. + +template +struct ExpressionHasTensorBroadcastingOp { + enum { value = false }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorAssignOp > { + enum { value = ExpressionHasTensorBroadcastingOp::value }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorCwiseUnaryOp > { + enum { value = ExpressionHasTensorBroadcastingOp::value }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorCwiseBinaryOp > { + enum { + value = ExpressionHasTensorBroadcastingOp::value || + ExpressionHasTensorBroadcastingOp::value + }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorBroadcastingOp > { + enum { value = true }; +}; + +// -------------------------------------------------------------------------- // + +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. + */ +template +class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; + + // Including `unsupported/Eigen/CXX11/Tensor` in different translation units + // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR + // violation. 
If this template is instantiated with a non-default device, it + // means that this header file was included without defining + // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`. + static_assert(std::is_same::value, + "Default executor instantiated with non-default device. " + "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or " + "EIGEN_USE_SYCL before including Eigen headers."); + EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - for (Index i = 0; i < size; ++i) { + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + for (StorageIndex i = 0; i < size; ++i) { evaluator.evalScalar(i); } } @@ -44,35 +107,48 @@ class TensorExecutor } }; - -template -class TensorExecutor -{ +/** + * Default async execution strategy is not implemented. Currently it's only + * available for ThreadPoolDevice (see definition below). + */ +template +class TensorAsyncExecutor {}; + +/** + * Process all the data with a single cpu thread, using vectorized instructions. 
+ */ +template +class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; + EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) - { + static EIGEN_STRONG_INLINE void run( + const Expression& expr, const DefaultDevice& device = DefaultDevice()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. 
- const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; - for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + const StorageIndex UnrolledSize = + (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - const Index VectorizedSize = (size / PacketSize) * PacketSize; - for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } - for (Index i = VectorizedSize; i < size; ++i) { + for (StorageIndex i = VectorizedSize; i < size; ++i) { evaluator.evalScalar(i); } } @@ -80,55 +156,162 @@ class TensorExecutor } }; +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template +class TensorExecutor { + public: + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; + + static const int NumDims = traits::NumDimensions; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorBlockMapper + TensorBlockMapper; + + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + Evaluator evaluator(expr, device); + + // TODO(ezhulenev): Do not use tiling for small tensors? + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Query expression tree for desired block size/shape. 
+ const TensorBlockResourceRequirements requirements = + evaluator.getResourceRequirements(); -// Multicore strategy: the index space is partitioned and each partition is executed on a single core + const TensorBlockMapper block_mapper( + typename TensorBlockDesc::Dimensions(evaluator.dimensions()), + requirements); + + // Share scratch memory allocator between all blocks. + TensorBlockScratch scratch(device); + + const StorageIndex total_block_count = block_mapper.blockCount(); + for (StorageIndex i = 0; i < total_block_count; ++i) { + TensorBlockDesc desc = block_mapper.blockDescriptor(i); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + * + * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread + * pool, and will block the caller thread until all tasks are finished. + * + * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to + * the ThreadPoolDevice managed thread pool, and will return immediately. + * It will call 'done' callback after all tasks are finished. + */ #ifdef EIGEN_USE_THREADS -template + +template +struct TensorExecutorTilingContext { + TensorExecutorTilingContext() = default; + TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, + const TensorOpCost& b_cost, size_t b_aligned_size) + : block_mapper(b_mapper), + cost(b_cost), + aligned_blocksize(b_aligned_size) {} + + TensorBlockMapper block_mapper; // navigate through blocks + TensorOpCost cost; // cost of computing a single block + size_t aligned_blocksize; // block size after memory alignment +}; + +// Computes a block evaluation parameters, and allocates temporary memory buffer +// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. 
+template +TensorExecutorTilingContext GetTensorExecutorTilingContext( + const Evaluator& evaluator) { + // Query expression tree for desired block size/shape. + TensorBlockResourceRequirements requirements = + evaluator.getResourceRequirements(); + + // Update target block size based on cost model. + double taskSize = TensorCostModel::taskSize( + 1, requirements.cost_per_coeff); + requirements.size = static_cast(1.0 / taskSize); + + TensorBlockMapper block_mapper( + typename TensorBlockMapper::Dimensions(evaluator.dimensions()), + requirements); + + size_t block_size = block_mapper.blockTotalSize(); + const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + const size_t aligned_blocksize = + align * + divup(block_size * sizeof(typename Evaluator::Scalar), align); + + return {block_mapper, requirements.cost_per_coeff * block_size, + aligned_blocksize}; +} + +template struct EvalRange { - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, + const StorageIndex lastIdx) { Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - for (Index i = first; i < last; ++i) { + eigen_assert(lastIdx >= firstIdx); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { evaluator.evalScalar(i); } } - static Index alignBlockSize(Index size) { - return size; - } + static StorageIndex alignBlockSize(StorageIndex size) { return size; } }; -template -struct EvalRange { - static const int PacketSize = unpacket_traits::size; +template +struct EvalRange { + static const int PacketSize = + unpacket_traits::size; - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, + const StorageIndex lastIdx) { Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - Index i = first; - if (last - first >= PacketSize) { - eigen_assert(first % PacketSize == 0); - Index 
last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + eigen_assert(lastIdx >= firstIdx); + StorageIndex i = firstIdx; + if (lastIdx - firstIdx >= PacketSize) { + eigen_assert(firstIdx % PacketSize == 0); + StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize; + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - last_chunk_offset = last - PacketSize; + last_chunk_offset = lastIdx - PacketSize; for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < last; ++i) { + for (; i < lastIdx; ++i) { evaluator.evalScalar(i); } } - static Index alignBlockSize(Index size) { + static StorageIndex alignBlockSize(StorageIndex size) { // Align block size to packet size and account for unrolling in run above. 
if (size >= 16 * PacketSize) { return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); @@ -138,144 +321,376 @@ struct EvalRange { } }; -template -class TensorExecutor { +template +class TensorExecutor { public: - typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); -#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [&evaluator](Index first, Index last) { - EvalRange::run(&evaluator, first, last); + EvalRange::alignBlockSize, + [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&evaluator, firstIdx, lastIdx); }); -#else - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? 
unpacket_traits::size : 1; - Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange::run( - &evaluator, numblocks * blocksize, size); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const ThreadPoolDevice& device) { + Evaluator evaluator(expr, device); + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const TilingContext tiling = + internal::GetTensorExecutorTilingContext(evaluator); + + auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, + IndexType lastBlockIdx) { + TensorBlockScratch scratch(device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; + ++block_idx) { + TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx); + evaluator.evalBlock(desc, scratch); + scratch.reset(); } - barrier.Wait(); + }; + + // Evaluate small expressions directly as a single block. 
+ if (tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(device); + TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions()); + evaluator.evalBlock(desc, scratch); + } else { + device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, + eval_block); } -#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) } evaluator.cleanup(); } }; -#endif // EIGEN_USE_THREADS +template +class TensorAsyncExecutor { + public: + typedef typename Expression::Index StorageIndex; + typedef TensorEvaluator Evaluator; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + DoneCallback done) { + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + typedef EvalRange EvalRange; + const StorageIndex size = array_prod(ctx->evaluator.dimensions()); + device.parallelForAsync( + size, ctx->evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); + }, + [ctx]() { delete ctx; }); + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + DoneCallback done) + : evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + Evaluator evaluator; + + private: + DoneCallback on_done; + }; +}; + +template +class TensorAsyncExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef 
TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + DoneCallback done) { + + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + ctx->tiling = internal::GetTensorExecutorTilingContext< + Evaluator, BlockMapper, Vectorizable>(ctx->evaluator); + + auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(ctx->device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; + ++block_idx) { + TensorBlockDesc desc = + ctx->tiling.block_mapper.blockDescriptor(block_idx); + ctx->evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. 
+ if (ctx->tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(ctx->device); + TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions()); + ctx->evaluator.evalBlock(desc, scratch); + delete ctx; + } else { + ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), + ctx->tiling.cost, eval_block, + [ctx]() { delete ctx; }); + } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + DoneCallback done) + : device(thread_pool), + evaluator(expr, thread_pool), + on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + const ThreadPoolDevice& device; + Evaluator evaluator; + TilingContext tiling; + + private: + DoneCallback on_done; + }; +}; + +#endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) -template -class TensorExecutor { +template +class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; static void run(const Expression& expr, const GpuDevice& device); }; - -#if defined(__CUDACC__) -template +#if defined(EIGEN_GPUCC) +template struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - for (Index i = first; i < last; i += step_size) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { + for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) { eval.evalScalar(i); } } }; -template -struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - const Index PacketSize = unpacket_traits::size; - const Index 
vectorized_size = (last / PacketSize) * PacketSize; - const Index vectorized_step_size = step_size * PacketSize; +template +struct EigenMetaKernelEval { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { + const StorageIndex PacketSize = unpacket_traits::size; + const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize; + const StorageIndex vectorized_step_size = step_size * PacketSize; // Use the vector path - for (Index i = first * PacketSize; i < vectorized_size; + for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for (Index i = vectorized_size + first; i < last; i += step_size) { + for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) { eval.evalScalar(i); } } }; -template +template __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, Index size) { +EigenMetaKernel(Evaluator eval, StorageIndex size) { - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; + const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; + const StorageIndex step_size = blockDim.x * gridDim.x; const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval::run(eval, first_index, size, step_size); + EigenMetaKernelEval::run(eval, first_index, size, step_size); } /*static*/ -template -inline void TensorExecutor::run( +template +EIGEN_STRONG_INLINE void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * - 
device.maxCudaThreadsPerMultiProcessor() / block_size; - const Index size = array_prod(evaluator.dimensions()); + + const int block_size = device.maxGpuThreadsPerBlock(); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / block_size; + const StorageIndex size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); - LAUNCH_CUDA_KERNEL( - (EigenMetaKernel, Index>), + LAUNCH_GPU_KERNEL( + (EigenMetaKernel, StorageIndex>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } -#endif // __CUDACC__ +#endif // EIGEN_GPUCC #endif // EIGEN_USE_GPU // SYCL Executor policy #ifdef EIGEN_USE_SYCL -template -class TensorExecutor { -public: - static inline void run(const Expression &expr, const SyclDevice &device) { - // call TensorSYCL module - TensorSycl::run(expr, device); +template +struct ExecExprFunctorKernel { + typedef typename Evaluator::Index Index; + Evaluator evaluator; + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel( + const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( + cl::sycl::nd_item<1> itemID) { + compute(itemID); + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + Index gId = static_cast(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + const Index vectorizedRange = + (range / Evaluator::PacketSize) * Evaluator::PacketSize; + 
Index gId = static_cast(itemID.get_global_linear_id()); + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); + } + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = + Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + + dev.template nullary_kernel_launcher< + typename Evaluator::CoeffReturnType, + ExecExprFunctorKernel >( + evaluator, + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + Index(1), range); + } + evaluator.cleanup(); } }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 85dfc7a6..c9bccfc6 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -38,7 +38,7 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 }; @@ -89,6 +89,10 @@ struct traits > 
typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename TypeConversion::type + PointerType; }; template @@ -161,7 +165,12 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type + PointerType; enum { Flags = 0 }; @@ -238,7 +247,12 @@ struct traits::type _Arg3Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type + PointerType; enum { Flags = 0 }; @@ -314,6 +328,9 @@ struct traits > typedef typename ElseXprType::Nested ElseNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename conditional::val, + typename traits::PointerType, + typename traits::PointerType>::type PointerType; }; template diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 08eb5595..4a1a0687 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H -// This code requires the ability to initialize arrays of constant -// values directly inside a class. 
-#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 - namespace Eigen { /** \class TensorFFT @@ -71,6 +67,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename traits::PointerType PointerType; }; template @@ -130,17 +127,24 @@ struct TensorEvaluator, D typedef OutputScalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); @@ -165,19 +169,19 @@ struct TensorEvaluator, D return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (data) { evalToBuf(data); return false; } else { - m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + m_data = 
(EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); evalToBuf(m_data); return true; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); m_data = NULL; @@ -200,11 +204,16 @@ struct TensorEvaluator, D return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } - + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) { const bool write_to_out = internal::is_same::value; ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); @@ -230,20 +239,32 @@ struct TensorEvaluator, D // t_n = exp(sqrt(-1) * pi * n^2 / line_len) // for n = 0, 1,..., line_len-1. 
// For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - pos_j_base_powered[0] = ComplexScalar(1, 0); - if (line_len > 1) { - const RealScalar pi_over_len(EIGEN_PI / line_len); - const ComplexScalar pos_j_base = ComplexScalar( - std::cos(pi_over_len), std::sin(pi_over_len)); - pos_j_base_powered[1] = pos_j_base; - if (line_len > 2) { - const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int j = 2; j < line_len + 1; ++j) { - pos_j_base_powered[j] = pos_j_base_powered[j - 1] * - pos_j_base_powered[j - 1] / - pos_j_base_powered[j - 2] * pos_j_base_sq; - } - } + + // The recurrence is correct in exact arithmetic, but causes + // numerical issues for large transforms, especially in + // single-precision floating point. + // + // pos_j_base_powered[0] = ComplexScalar(1, 0); + // if (line_len > 1) { + // const ComplexScalar pos_j_base = ComplexScalar( + // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); + // pos_j_base_powered[1] = pos_j_base; + // if (line_len > 2) { + // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + // for (int i = 2; i < line_len + 1; ++i) { + // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + // pos_j_base_powered[i - 1] / + // pos_j_base_powered[i - 2] * + // pos_j_base_sq; + // } + // } + // } + // TODO(rmlarsen): Find a way to use Eigen's vectorized sin + // and cosine functions here. 
+ for (int j = 0; j < line_len + 1; ++j) { + double arg = ((EIGEN_PI * j) * j) / line_len; + std::complex tmp(numext::cos(arg), numext::sin(arg)); + pos_j_base_powered[j] = static_cast(tmp); } } @@ -253,7 +274,7 @@ struct TensorEvaluator, D // get data into line_buf const Index stride = m_strides[dim]; if (stride == 1) { - memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; for (int j = 0; j < line_len; ++j, offset += stride) { @@ -261,7 +282,7 @@ struct TensorEvaluator, D } } - // processs the line + // process the line if (is_power_of_two) { processDataLineCooleyTukey(line_buf, line_len, log_len); } @@ -271,7 +292,7 @@ struct TensorEvaluator, D // write back if (FFTDir == FFT_FORWARD && stride == 1) { - memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); @@ -562,12 +583,12 @@ struct TensorEvaluator, D protected: Index m_size; - const FFT& m_fft; + const FFT EIGEN_DEVICE_REF m_fft; Dimensions m_dimensions; array m_strides; TensorEvaluator m_impl; - CoeffReturnType* m_data; - const Device& m_device; + EvaluatorPointerType m_data; + const Device EIGEN_DEVICE_REF m_device; // This will support a maximum FFT size of 2^32 for each dimension // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; @@ -645,7 +666,4 @@ struct TensorEvaluator, D } // end namespace Eigen -#endif // EIGEN_HAS_CONSTEXPR - - #endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index fcee5f60..ca39bb85 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ 
b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -20,7 +20,7 @@ namespace Eigen { * The fixed sized equivalent of * Eigen::Tensor t(3, 5, 7); * is - * Eigen::TensorFixedSize> t; + * Eigen::TensorFixedSize> t; */ template @@ -40,11 +40,18 @@ class TensorFixedSize : public TensorBase0), + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; @@ -333,27 +340,10 @@ class TensorFixedSize : public TensorBase::run(assign, DefaultDevice()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other) - { - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) - { - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize) + protected: EIGEN_DEVICE_FUNC diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 8bece4e6..e800dedc 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -12,9 +12,16 @@ namespace Eigen { +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ namespace internal { -template class MakePointer_> -struct traits > +template +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; @@ -25,50 +32,31 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 }; - template struct MakePointer { - // Intermediate typedef to workaround MSVC issue. - typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; }; -template class MakePointer_> -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorForcedEvalOp& type; + typedef const TensorForcedEvalOp& type; }; -template class MakePointer_> -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorForcedEvalOp type; + typedef TensorForcedEvalOp type; }; } // end namespace internal -// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) - -/** \class TensorForcedEvalOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. - * - * - */ -/// `template class MakePointer_` is added to convert the host pointer to the device pointer. 
-/// It is added due to the fact that for our device compiler `T*` is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. -/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . -/// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is `T*`. -template class MakePointer_> -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +template +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -89,49 +77,113 @@ class TensorForcedEvalOp : public TensorBase +struct non_integral_type_placement_new{ + template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) { + // Initialize non-trivially constructible types. + if (!internal::is_arithmetic::value) { + for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType(); + } +} +}; -template class MakePointer_> -struct TensorEvaluator, Device> +// SYCL does not support non-integral types +// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices +// no matching function for call to 'operator new' +template +struct non_integral_type_placement_new { + template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) { +} +}; +} // end namespace internal + +template +struct TensorEvaluator, Device> { - typedef TensorForcedEvalOp XprType; + typedef const typename internal::remove_all::type ArgType; + typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = 
internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = (PacketSize > 1), - Layout = TensorEvaluator::Layout, - RawAccess = true + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + RawAccess = true }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl - : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + static const int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), + m_device(device), m_buffer(NULL) { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); - // Should initialize the memory in case we're dealing with non POD types. 
- if (NumTraits::RequireInitialization) { - for (Index i = 0; i < numValues; ++i) { - new(m_buffer+i) CoeffReturnType(); - } - } + m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); + + internal::non_integral_type_placement_new()(numValues, m_buffer); + typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; - EvalTo evalToTmp(m_buffer, m_op); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + internal::TensorExecutor< + const EvalTo, typename internal::remove_const::type, + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>:: + run(evalToTmp, m_device); + return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate(m_buffer); + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp( + numValues * sizeof(CoeffReturnType))); + typedef TensorEvalToOp::type> + EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, + std::move(done)); + internal::TensorAsyncExecutor< + const EvalTo, typename internal::remove_const::type, + decltype(on_done), + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>:: + runAsync(evalToTmp, m_device, std::move(on_done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate_temp(m_buffer); m_buffer = NULL; } @@ -146,21 +198,37 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements 
getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_buffer != NULL); + return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EvaluatorPointerType data() const { return m_buffer; } - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() { return m_impl; } - /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_buffer.bind(cgh); + m_impl.bind(cgh); + } +#endif private: TensorEvaluator m_impl; const ArgType m_op; - const Device& m_device; - typename MakePointer::Type m_buffer; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_buffer; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 52b803d7..246ebe44 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -12,7 +12,7 @@ namespace Eigen { -// MakePointer class is used as a container of the adress space of the pointer +// MakePointer class is used as a container of the address space of the pointer // on the host and on the device. 
From the host side it generates the T* pointer // and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to // T* m_data on the host. It is always called on the device. @@ -20,8 +20,35 @@ namespace Eigen { // map_allocator. template struct MakePointer { typedef T* Type; + typedef const T* ConstType; }; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) { + return const_cast(data); +} + +// The StorageMemory class is a container of the device specific pointer +// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression +// is a device-agnostic type and need MakePointer class for type conversion, +// the TensorEvaluator class can be specialized for a device, hence it is possible +// to construct different types of temproray storage memory in TensorEvaluator +// for different devices by specializing the following StorageMemory class. +template struct StorageMemory: MakePointer {}; + +namespace internal{ +template struct Pointer_type_promotion { + static const bool val=false; +}; +template struct Pointer_type_promotion { + static const bool val = true; +}; +template struct TypeConversion { + typedef A* type; +}; +} + + template class MakePointer_ = MakePointer> class TensorMap; template class Tensor; template class TensorFixedSize; @@ -37,7 +64,7 @@ template class Ma template class TensorIndexTupleOp; template class TensorTupleReducerOp; template class TensorConcatenationOp; -template class TensorContractionOp; +template class TensorContractionOp; template class TensorConversionOp; template class TensorConvolutionOp; template class TensorFFTOp; @@ -58,21 +85,50 @@ template class TensorInflationOp; template class TensorGeneratorOp; template class TensorAssignOp; template class TensorScanOp; +template class TensorTraceOp; template class TensorCustomUnaryOp; template class TensorCustomBinaryOp; template class MakePointer_ = MakePointer> class TensorEvalToOp; -template class MakePointer_ = 
MakePointer> class TensorForcedEvalOp; +template class TensorForcedEvalOp; template class TensorDevice; +template class TensorAsyncDevice; template struct TensorEvaluator; +struct NoOpOutputKernel; + struct DefaultDevice; struct ThreadPoolDevice; struct GpuDevice; struct SyclDevice; +#ifdef EIGEN_USE_SYCL + +template struct MakeSYCLPointer { + typedef Eigen::TensorSycl::internal::RangeAccess Type; +}; + +template +EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess& +constCast(const Eigen::TensorSycl::internal::RangeAccess& data) { + return data; +} + +template +struct StorageMemory : MakeSYCLPointer {}; +template +struct StorageMemory : StorageMemory {}; + +namespace TensorSycl { +namespace internal{ +template class GenericNondeterministicReducer; +} +} +#endif + + enum FFTResultType { RealPart = 0, ImagPart = 1, @@ -98,10 +154,36 @@ struct IsVectorizable { TensorEvaluator::IsAligned; }; +// Tiled evaluation strategy. +enum TiledEvaluation { + Off = 0, // tiled evaluation is not supported + On = 1, // still work in progress (see TensorBlock.h) +}; + +template +struct IsTileable { + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). + static const bool BlockAccess = + TensorEvaluator::BlockAccess && + TensorEvaluator::PreferBlockAccess; + + static const TiledEvaluation value = + BlockAccess ? 
TiledEvaluation::On : TiledEvaluation::Off; +}; + template ::value> + bool Vectorizable = IsVectorizable::value, + TiledEvaluation Tiling = IsTileable::value> class TensorExecutor; +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorAsyncExecutor; + + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index d73f6dc6..d9630322 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -20,7 +20,7 @@ namespace internal { template struct scalar_mod_op { EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; } const Scalar m_divisor; }; template @@ -33,8 +33,8 @@ struct functor_traits > */ template struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } }; template struct functor_traits > @@ -42,7 +42,7 @@ struct functor_traits > template struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return numext::fmod(a, b); @@ -54,50 +54,19 @@ struct functor_traits > { PacketAccess = false }; }; - -/** \internal - * \brief Template functor to compute the sigmoid of a scalar - * \sa class CwiseUnaryOp, ArrayBase::sigmoid() - */ -template -struct scalar_sigmoid_op { - 
EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { - const T one = T(1); - return one / (one + numext::exp(-x)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(const Packet& x) const { - const Packet one = pset1(T(1)); - return pdiv(one, padd(one, pexp(pnegate(x)))); - } -}; - -template -struct functor_traits > { - enum { - Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 6, - PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && - packet_traits::HasNegate && packet_traits::HasExp - }; -}; - - template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; // Standard reduction functors template struct SumReducer { - static const bool PacketAccess = packet_traits::HasAdd; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { internal::scalar_sum_op sum_op; *accum = sum_op(*accum, t); @@ -133,16 +102,14 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd + PacketAccess = PacketType::HasAdd, + IsStateful = false, + IsExactlyAssociative = NumTraits::IsInteger }; }; - template struct MeanReducer { - static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; - static const bool IsStateful = true; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) { } @@ -166,16 +133,20 @@ template struct MeanReducer return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum / scalarCount_; + internal::scalar_quotient_op quotient_op; + return quotient_op(accum, T(scalarCount_)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return pdiv(vaccum, pset1(packetCount_)); + return pdiv(vaccum, 
pset1(T(packetCount_))); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { internal::scalar_sum_op sum_op; - return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size); + internal::scalar_quotient_op quotient_op; + return quotient_op( + sum_op(saccum, predux(vaccum)), + T(scalarCount_ + packetCount_ * unpacket_traits::size)); } protected: @@ -187,7 +158,10 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd + PacketAccess = PacketType::HasAdd && + PacketType::HasDiv && !NumTraits::IsInteger, + IsStateful = true, + IsExactlyAssociative = NumTraits::IsInteger }; }; @@ -218,20 +192,19 @@ struct MinMaxBottomValue { }; -template struct MaxReducer +template struct MaxReducer { - static const bool PacketAccess = packet_traits::HasMax; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t > *accum) { *accum = t; } + scalar_max_op op; + *accum = op(t, *accum); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmax(*accum, p); + scalar_max_op op; + (*accum) = op.packetOp(*accum, p); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue::IsInteger>::bottom_value(); + return MinMaxBottomValue::IsInteger>::bottom_value(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { @@ -246,33 +219,34 @@ template struct MaxReducer } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return numext::maxi(saccum, predux_max(vaccum)); + scalar_max_op op; + return op(saccum, op.predux(vaccum)); } }; -template -struct reducer_traits, Device> { +template + struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMax + PacketAccess = 
PacketType::HasMax, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation!=PropagateFast) }; }; - -template struct MinReducer +template struct MinReducer { - static const bool PacketAccess = packet_traits::HasMin; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t < *accum) { *accum = t; } + scalar_min_op op; + *accum = op(t, *accum); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmin(*accum, p); + scalar_min_op op; + (*accum) = op.packetOp(*accum, p); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue::IsInteger>::bottom_value(); + return MinMaxBottomValue::IsInteger>::bottom_value(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { @@ -287,24 +261,23 @@ template struct MinReducer } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return numext::mini(saccum, predux_min(vaccum)); + scalar_min_op op; + return op(saccum, op.predux(vaccum)); } }; -template -struct reducer_traits, Device> { +template + struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMin + PacketAccess = PacketType::HasMin, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation!=PropagateFast) }; }; - template struct ProdReducer { - static const bool PacketAccess = packet_traits::HasMul; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { internal::scalar_product_op prod_op; (*accum) = prod_op(*accum, t); @@ -313,7 +286,6 @@ template struct ProdReducer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { (*accum) = pmul(*accum, p); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { internal::scalar_cast_op conv; return conv(1); @@ 
-340,16 +312,15 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::MulCost, - PacketAccess = PacketType::HasMul + PacketAccess = PacketType::HasMul, + IsStateful = false, + IsExactlyAssociative = true }; }; struct AndReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; } @@ -365,15 +336,14 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; struct OrReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; } @@ -389,19 +359,22 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; - -// Argmin/Argmax reducers +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. 
template struct ArgMaxTupleReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t.second > accum->second) { *accum = t; } + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || accum->first > t.first ) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::lowest()); @@ -415,18 +388,21 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; template struct ArgMinTupleReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { - if (t.second < accum->second) { *accum = t; } + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || accum->first > t.first) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::highest()); @@ -440,7 +416,9 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true }; }; @@ -454,6 +432,7 @@ class GaussianGenerator { const array& std_devs) : m_means(means) { + EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; } @@ -461,6 +440,7 @@ class GaussianGenerator { EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { T tmp = T(0); + EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { T offset = coordinates[i] - m_means[i]; tmp += offset * offset / m_two_sigmas[i]; @@ -483,6 +463,25 @@ struct functor_traits > { }; }; +template +struct scalar_clamp_op { + EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : 
m_min(_min), m_max(_max) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& x) const { + return numext::mini(numext::maxi(x, m_min), m_max); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& x) const { + return internal::pmin(internal::pmax(x, pset1(m_min)), pset1(m_max)); + } + const Scalar m_min; + const Scalar m_max; +}; +template +struct functor_traits > +{ enum { Cost = 2 * NumTraits::AddCost, PacketAccess = (packet_traits::HasMin && packet_traits::HasMax)}; }; + } // end namespace internal } // end namespace Eigen diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index e27753b1..174bf068 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -87,40 +88,58 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = (internal::unpacket_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& 
device) - : m_generator(op.generator()) + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_generator(op.generator()) { - TensorEvaluator impl(op.expression(), device); - m_dimensions = impl.dimensions(); + TensorEvaluator argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } else { m_strides[NumDims - 1] = 1; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const @@ -133,7 +152,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), 
YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -145,6 +164,97 @@ struct TensorEvaluator, Device> return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.firstLevelCacheSize(); + // TODO(ezhulenev): Generator should have a cost. + return internal::TensorBlockResourceRequirements::skewed( + target_size); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = + static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + + // Offset in the output block buffer. + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + CoeffReturnType* block_buffer = block_storage.data(); + + static const int packet_size = PacketType::size; + + static const int inner_dim = is_col_major ? 
0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + const Index inner_dim_vectorized = inner_dim_size - packet_size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + // Generate data for the vectorized part of the inner-most dimension. + for (; i <= inner_dim_vectorized; i += packet_size) { + for (Index j = 0; j < packet_size; ++j) { + array j_coords = coords; // Break loop dependence. + j_coords[inner_dim] += j; + *(block_buffer + offset + i + j) = m_generator(j_coords); + } + coords[inner_dim] += packet_size; + } + // Finalize non-vectorized part of the inner-most dimension. + for (; i < inner_dim_size; ++i) { + *(block_buffer + offset + i) = m_generator(coords); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = + initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { // TODO(rmlarsen): This is just a placeholder. 
Define interface to make @@ -153,21 +263,26 @@ struct TensorEvaluator, Device> TensorOpCost::MulCost()); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {} +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_strides[i]; + const Index idx = index / m_fast_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } coords[0] = index; } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_strides[i]; + const Index idx = index / m_fast_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } @@ -175,8 +290,10 @@ struct TensorEvaluator, Device> } } + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; array m_strides; + array m_fast_strides; Generator m_generator; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h new file mode 100644 index 00000000..cb53ce29 --- /dev/null +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -0,0 +1,99 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design +// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but +// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler +// When compiling such files, gcc will end up trying to pick up the CUDA headers by +// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) +// This will obviously not work when trying to compile tensorflow on a system with no CUDA +// To work around this issue for HIP systems (and leave the default behaviour intact), the +// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and +// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is +// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well + +#if defined(EIGEN_USE_HIP) + +#define gpuStream_t hipStream_t +#define gpuDeviceProp_t hipDeviceProp_t +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetLastError hipGetLastError +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorName hipGetErrorName +#define gpuGetErrorString hipGetErrorString +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuStreamDefault hipStreamDefault +#define gpuGetDevice hipGetDevice +#define gpuSetDevice hipSetDevice +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemsetAsync hipMemsetAsync +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuStreamQuery hipStreamQuery +#define 
gpuSharedMemConfig hipSharedMemConfig +#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig +#define gpuStreamSynchronize hipStreamSynchronize +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuMemcpy hipMemcpy + +#else + +#define gpuStream_t cudaStream_t +#define gpuDeviceProp_t cudaDeviceProp +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetLastError cudaGetLastError +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorName cudaGetErrorName +#define gpuGetErrorString cudaGetErrorString +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuStreamDefault cudaStreamDefault +#define gpuGetDevice cudaGetDevice +#define gpuSetDevice cudaSetDevice +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemsetAsync cudaMemsetAsync +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuStreamQuery cudaStreamQuery +#define gpuSharedMemConfig cudaSharedMemConfig +#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig +#define gpuStreamSynchronize cudaStreamSynchronize +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuMemcpy cudaMemcpy + +#endif + +// gpu_assert can be overridden +#ifndef gpu_assert + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +// HIPCC do not support the use of assert on the GPU side. 
+#define gpu_assert(COND) +#else +#define gpu_assert(COND) assert(COND) +#endif + +#endif // gpu_assert + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h new file mode 100644 index 00000000..1d142f2e --- /dev/null +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -0,0 +1,44 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) + +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef gpuStream_t +#undef gpuDeviceProp_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuGetDeviceCount +#undef gpuGetErrorString +#undef gpuGetDeviceProperties +#undef gpuStreamDefault +#undef gpuGetDevice +#undef gpuSetDevice +#undef gpuMalloc +#undef gpuFree +#undef gpuMemsetAsync +#undef gpuMemcpyAsync +#undef gpuMemcpyDeviceToDevice +#undef gpuMemcpyDeviceToHost +#undef gpuMemcpyHostToDevice +#undef gpuStreamQuery +#undef gpuSharedMemConfig +#undef gpuDeviceSetSharedMemConfig +#undef gpuStreamSynchronize +#undef gpuDeviceSynchronize +#undef gpuMemcpy + +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 566856ed..dd51850b 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ 
b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -27,6 +27,7 @@ namespace Eigen { * patch_cols, and 1 for all the additional dimensions. */ namespace internal { + template struct traits > : public traits { @@ -38,6 +39,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -52,6 +54,66 @@ struct nested, 1, typename eval type; }; +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename 
Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = + (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, + padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + } // end namespace internal template @@ -70,12 +132,12 @@ class TensorImagePatchOp : public TensorBase, Device> typedef TensorEvaluator Impl; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) { 
EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -238,9 +309,15 @@ struct TensorEvaluator, Device> // Calculate the padding m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + // The padding size calculation for PADDING_SAME has been updated to + // be consistent with how TensorFlow extracts its paddings. + m_rowPaddingTop = numext::maxi(0, m_rowPaddingTop); + m_colPaddingLeft = numext::maxi(0, m_colPaddingLeft); break; default: eigen_assert(false && "unexpected padding"); + m_outputCols=0; // silence the uninitialised warning; + m_outputRows=0; //// silence the uninitialised warning; } } eigen_assert(m_outputRows > 0); @@ -312,12 +389,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -418,20 +503,27 @@ struct TensorEvaluator, Device> return packetWithPossibleZero(index); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() 
const { return m_colPaddingLeft; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -449,6 +541,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type 
values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -500,6 +593,7 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; + const Device EIGEN_DEVICE_REF m_device; TensorEvaluator m_impl; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 3209fecd..2d8c7b90 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -37,36 +37,36 @@ namespace Eigen { * \sa Tensor */ -template +template struct type2index { - static const DenseIndex value = n; - EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; } - EIGEN_DEVICE_FUNC void set(DenseIndex val) { + static const Index value = n; + EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; } + EIGEN_DEVICE_FUNC void set(Index val) { eigen_assert(val == n); } }; // This can be used with IndexPairList to get compile-time constant pairs, // such as IndexPairList, type2indexpair<3,4>>(). 
-template +template struct type2indexpair { - static const DenseIndex first = f; - static const DenseIndex second = s; + static const Index first = f; + static const Index second = s; - constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { - return IndexPair(f, s); + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { + return IndexPair(f, s); } - EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { eigen_assert(val.first == f); eigen_assert(val.second == s); } }; -template struct NumTraits > +template struct NumTraits > { - typedef DenseIndex Real; + typedef Index Real; enum { IsComplex = 0, RequireInitialization = false, @@ -75,28 +75,28 @@ template struct NumTraits > MulCost = 1 }; - EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } - EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } - EIGEN_DEVICE_FUNC static inline Real highest() { return n; } - EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; } }; namespace internal { template -EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { - val = new_val; +EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) { + val = internal::convert_index(new_val); } -template -EIGEN_DEVICE_FUNC void update_value(type2index& val, DenseIndex new_val) { +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, Index new_val) { val.set(new_val); } template -EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { val = new_val; } -template -EIGEN_DEVICE_FUNC void 
update_value(type2indexpair& val, IndexPair new_val) { +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { val.set(new_val); } @@ -106,36 +106,36 @@ struct is_compile_time_constant { static constexpr bool value = false; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; @@ -228,15 +228,15 @@ template -template +template struct tuple_coeff { template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple& t) { // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); return (i == Idx ? 
array_get(t) : tuple_coeff::get(i, t)); } template - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT& value) { + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT& value) { if (i == Idx) { update_value(array_get(t), value); } else { @@ -245,7 +245,7 @@ struct tuple_coeff { } template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { return ((i == Idx) & is_compile_time_constant::ValType>::value) || tuple_coeff::value_known_statically(i, t); } @@ -268,18 +268,18 @@ struct tuple_coeff { template struct tuple_coeff<0, ValueT> { template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return array_get<0>(t)/* * (i == 0)*/; } template - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT value) { + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT value) { eigen_assert (i == 0); update_value(array_get<0>(t), value); } template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { - return is_compile_time_constant::ValType>::value & (i == 0); + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value && (i == 0); } template @@ -298,32 +298,43 @@ struct tuple_coeff<0, ValueT> { template struct IndexList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr 
Index operator[] (const Index i) const { + return internal::tuple_coeff >::value-1, Index>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { - return internal::tuple_coeff >::value-1, DenseIndex>::set(i, *this, value); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) { + return internal::tuple_coeff >::value-1, Index>::set(i, *this, value); } EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::value_known_statically(i, *this); } EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { - return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_known_statically(*this); + return internal::tuple_coeff >::value-1, Index>::values_up_to_known_statically(*this); } EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); + return internal::tuple_coeff >::value-1, Index>::values_up_to_statically_known_to_increase(*this); } }; +template +std::ostream& operator<<(std::ostream& os, 
+ const IndexList& dims) { + os << "["; + for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} template constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { @@ -333,26 +344,28 @@ constexpr IndexList make_index_list(FirstType val1, Ot template struct IndexPairList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const Index i) const { + return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair value) { - return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair value) { + return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); } EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() { } - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::value_known_statically(i, *this); } }; namespace internal { -template size_t array_prod(const IndexList& sizes) { - size_t result = 1; - for (int i = 0; i < array_size >::value; ++i) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList& sizes) { + Index result = 1; + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < array_size >::value; ++i) { result *= sizes[i]; } return result; @@ -372,30 +385,30 @@ 
template struct array_size >::value; }; -template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList& a) { +template EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList& a) { return IndexTupleExtractor::get_val(a); } -template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList& a) { +template EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList& a) { return IndexTupleExtractor::get_val(a); } template struct index_known_statically_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { return false; } }; template struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { return IndexList().value_known_statically(i); } }; template struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { return IndexList().value_known_statically(i); } }; @@ -447,14 +460,14 @@ template template struct index_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) == value); } @@ -462,7 +475,7 @@ struct index_statically_eq_impl > { template struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) == value); } @@ -471,14 +484,14 @@ struct 
index_statically_eq_impl > { template struct index_statically_ne_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) != value); } @@ -486,7 +499,7 @@ struct index_statically_ne_impl > { template struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) != value); } @@ -495,14 +508,14 @@ struct index_statically_ne_impl > { template struct index_statically_gt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) > value); } @@ -510,7 +523,7 @@ struct index_statically_gt_impl > { template struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) > value); } @@ -520,14 +533,14 @@ struct index_statically_gt_impl > { template struct index_statically_lt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool 
run(Index, Index) { return false; } }; template struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) < value); } @@ -535,7 +548,7 @@ struct index_statically_lt_impl > { template struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexList().value_known_statically(i) & (IndexList().get(i) < value); } @@ -545,14 +558,14 @@ struct index_statically_lt_impl > { template struct index_pair_first_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).first == value); } @@ -560,7 +573,7 @@ struct index_pair_first_statically_eq_impl struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).first == value); } @@ -570,14 +583,14 @@ struct index_pair_first_statically_eq_impl struct index_pair_second_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } }; template struct index_pair_second_statically_eq_impl > { - 
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).second == value); } @@ -585,7 +598,7 @@ struct index_pair_second_statically_eq_impl struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).second == value); } @@ -602,7 +615,7 @@ namespace internal { template struct index_known_statically_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) { return false; } }; @@ -623,42 +636,42 @@ struct indices_statically_known_to_increase_impl { template struct index_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_statically_ne_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_statically_gt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_statically_lt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_pair_first_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; template struct index_pair_second_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { return false; } }; @@ -674,7 +687,7 @@ struct index_pair_second_statically_eq_impl { namespace Eigen { namespace internal { template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { return index_known_statically_impl::run(i); } @@ -689,32 +702,32 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increa } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { return index_statically_eq_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) { return index_statically_ne_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { return index_statically_gt_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { return index_statically_lt_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { return 
index_pair_first_statically_eq_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { return index_pair_second_statically_eq_impl::run(i, value); } diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index f391fb9e..c5cb61af 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -84,18 +85,25 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : 
m_impl(op.expression(), device), m_strides(op.strides()) { m_dimensions = m_impl.dimensions(); @@ -129,11 +137,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -144,6 +152,7 @@ struct TensorEvaluator, Device> eigen_assert(index < dimensions().TotalSize()); *inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -158,6 +167,7 @@ struct TensorEvaluator, Device> *inputIndex += index / m_strides[0]; return true; } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -193,6 +203,7 @@ struct TensorEvaluator, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -213,7 +224,14 @@ struct TensorEvaluator, Device> compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: Dimensions m_dimensions; diff --git 
a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index 33edc49e..26a3818f 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -32,7 +32,7 @@ struct Initializer { Eigen::array::Index, traits::NumDimensions>* indices, const InitList& vals) { int i = 0; - for (auto v : vals) { + for (const auto& v : vals) { (*indices)[traits::NumDimensions - N] = i++; Initializer::run(tensor, indices, v); } @@ -48,7 +48,7 @@ struct Initializer { const InitList& vals) { int i = 0; // There is likely a faster way to do that than iterating. - for (auto v : vals) { + for (const auto& v : vals) { (*indices)[traits::NumDimensions - 1] = i++; tensor.coeffRef(*indices) = v; } diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index ede3939c..6d5cce4a 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -21,7 +21,7 @@ namespace Eigen { * \brief Fast integer division by a constant. * * See the paper from Granlund and Montgomery for explanation. 
- * (at http://dx.doi.org/10.1145/773473.178249) + * (at https://doi.org/10.1145/773473.178249) * * \sa Tensor */ @@ -35,8 +35,10 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_GPU_COMPILE_PHASE return __clz(val); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC unsigned long index; _BitScanReverse(&index, val); @@ -51,8 +53,10 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_GPU_COMPILE_PHASE return __clzll(val); +#elif defined(SYCL_DEVICE_ONLY) + return static_cast(cl::sycl::clz(val)); #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 unsigned long index; _BitScanReverse64(&index, val); @@ -86,8 +90,10 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __umulhi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); #else return (static_cast(a) * b) >> 32; #endif @@ -95,9 +101,11 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __umul64hi(a, b); -#elif defined(__SIZEOF_INT128__) +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#elif EIGEN_HAS_BUILTIN_INT128 __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); return static_cast(v >> 64); #else @@ -116,7 +124,7 @@ namespace { template struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) +#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) 
return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -159,7 +167,7 @@ struct TensorIntDivisor { shift2 = log_div > 1 ? log_div-1 : 0; } - // Must have 0 <= numerator. On platforms that dont support the __uint128_t + // Must have 0 <= numerator. On platforms that don't support the __uint128_t // type numerator should also be less than 2^32-1. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { eigen_assert(static_cast::type>(numerator) < NumTraits::highest()/2); @@ -195,8 +203,10 @@ class TensorIntDivisor { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_GPU_COMPILE_PHASE return (__umulhi(magic, n) >> shift); +#elif defined(SYCL_DEVICE_ONLY) + return (cl::sycl::mul_hi(magic, static_cast(n)) >> shift); #else uint64_t v = static_cast(magic) * static_cast(n); return (static_cast(v >> 32) >> shift); diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index cd0109ef..80106c1a 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -46,6 +46,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = traits::NumDimensions; static const int Layout = (traits::Layout == ColMajor) ? 
RowMajor : ColMajor; + typedef typename XprTraits::PointerType PointerType; }; template @@ -68,39 +69,22 @@ template class TensorLayoutSwapOp : public TensorBase, WriteAccessors> { public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) - : m_xpr(expr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} EIGEN_DEVICE_FUNC const typename internal::remove_all::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp) protected: typename XprType::Nested m_xpr; }; @@ -118,12 +102,18 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = 
TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { for(int i = 0; i < NumDims; ++i) { @@ -131,16 +121,25 @@ struct TensorEvaluator, Device> } } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -159,7 +158,9 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } const TensorEvaluator& impl() const { 
return m_impl; } @@ -180,11 +181,17 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index ee0078bb..73ff3d2d 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#ifndef __CUDACC__ +#if !defined(EIGEN_GPUCC) #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC @@ -43,12 +43,56 @@ #define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ typename internal::enable_if< ( __condition__ ) , int >::type = 0 +// Define a macro to use a reference on the host but a value on the device +#if defined(SYCL_DEVICE_ONLY) + #define EIGEN_DEVICE_REF +#else + #define EIGEN_DEVICE_REF & +#endif + +// Define a macro for catching SYCL exceptions if exceptions are enabled +#define EIGEN_SYCL_TRY_CATCH(X) \ + do { \ + EIGEN_TRY {X;} \ + EIGEN_CATCH(const cl::sycl::exception& e) { \ + EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \ + std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + "\n" + \ + e.what())); \ + } \ + } while (false) -#if EIGEN_HAS_CONSTEXPR 
-#define EIGEN_CONSTEXPR constexpr +// Define a macro if local memory flags are unset or one of them is set +// Setting both flags is the same as unsetting them +#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ + (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#endif + +#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) + #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator =; \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; } #else -#define EIGEN_CONSTEXPR + #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) #endif +/** \internal + * \brief Macro to manually inherit assignment operators. + * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined. + * This also inherits template operator=(const OtherDerived&) assignments. 
+ * With C++11 or later this also default-implements the copy-constructor + */ +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) #endif diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index e4fc86a4..6834c97e 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -30,21 +30,39 @@ template class MakePoin { public: typedef TensorMap Self; - typedef typename PlainObjectType::Base Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; + typedef TensorBase > Base; + #ifdef EIGEN_USE_SYCL + typedef typename Eigen::internal::remove_reference::type>::type Nested; + #else + typedef typename Eigen::internal::nested::type Nested; + #endif + typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType; - /* typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar *, - const Scalar *>::type - PointerType;*/ typedef typename MakePointer_::Type PointerType; - typedef PointerType PointerArgType; + typedef typename MakePointer_::ConstType PointerConstType; + + // WARN: PointerType still can be a pointer to const (const Scalar*), for + // example in TensorMap> expression. This type of + // expression should be illegal, but adding this restriction is not possible + // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). 
+ typedef typename internal::conditional< + bool(internal::is_lvalue::value), + PointerType, // use simple pointer in lvalue expressions + PointerConstType // use const pointer in rvalue expressions + >::type StoragePointerType; + + // If TensorMap was constructed over rvalue expression (e.g. const Tensor), + // we should return a reference to const from operator() (and others), even + // if TensorMap itself is not const. + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar&, + const Scalar& + >::type StorageRefType; static const int Options = Options_; @@ -59,47 +77,47 @@ template class MakePoin }; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, 
dim2, dim3, dim4, dim5) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } @@ -116,12 +134,12 @@ template class MakePoin EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PointerType data() { return m_data; } + EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } + EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -134,14 +152,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const + EIGEN_STRONG_INLINE StorageRefType operator()() const { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const { eigen_internal_assert(index >= 0 && index < 
size()); return m_data[index]; @@ -149,9 +167,10 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; @@ -162,7 +181,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -173,7 +192,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -184,7 +203,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -195,7 +214,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, 
Index i2, Index i3, Index i4) const { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); @@ -208,7 +227,7 @@ template class MakePoin #endif EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -221,14 +240,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() + EIGEN_STRONG_INLINE StorageRefType operator()() { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -236,9 +255,10 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) { static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); const std::size_t NumDims = sizeof...(otherIndices) + 2; if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); @@ -250,7 +270,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -261,7 +281,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -272,7 +292,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -283,7 +303,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); @@ -295,26 +315,10 @@ template class MakePoin } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& 
operator=(const Self& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Self& operator=(const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap) private: - typename MakePointer_::Type m_data; + StoragePointerType m_data; Dimensions m_dimensions; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 615559d4..a6181d35 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,11 +52,13 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) -template <> +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) + +typedef ulonglong2 Packet4h2; +template<> struct PacketType { - typedef half2 type; - static const int size = 2; + typedef Packet4h2 type; + static const int size = 8; enum { HasAdd = 1, HasSub = 1, @@ -75,6 +77,7 @@ struct PacketType { HasSqrt = 1, HasRsqrt = 1, HasExp = 1, + HasExpm1 = 0, HasLog = 1, HasLog1p = 0, HasLog10 = 0, @@ -84,9 +87,57 @@ struct PacketType { #endif #if defined(EIGEN_USE_SYCL) -template - struct PacketType { - typedef T type; + +namespace TensorSycl { +namespace internal { + +template struct PlusOp { + static constexpr Index Value = A + B; +}; + +template struct DivOp { + static constexpr Index Value = A / B; +}; + +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { + op(start); + static_for::Value, end, step, + 
StepOp>::loop(op); + } +}; +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} +}; + +template +struct Vectorise { + static const int PacketSize = 1; + typedef OutScalar PacketReturnType; +}; + +template +struct Vectorise { + static const int PacketSize = Eigen::PacketType::size; + typedef typename Eigen::PacketType::type PacketReturnType; +}; + +static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { + return ((((x) + (y)-1) / (y)) * (y)); +} + +} // namespace internal +} // namespace TensorSycl + +template <> + struct PacketType { + typedef half type; static const int size = 1; enum { HasAdd = 0, @@ -103,8 +154,58 @@ template HasBlend = 0 }; }; -#endif +template +struct PacketType : internal::default_packet_traits { + typedef Scalar type; + typedef Scalar half; + enum { + Vectorizable = 0, + size = 1, + AlignedOnScalar = 0, + HasHalfPacket = 0 + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; + +}; + +template +struct PacketType : PacketType{}; + +#ifndef EIGEN_DONT_VECTORIZE_SYCL +#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\ +template<> struct PacketType : internal::sycl_packet_traits \ +{\ + typedef typename internal::packet_traits::type type;\ + typedef typename internal::packet_traits::half half;\ +}; + + +PACKET_TYPE(const, float, 1, 4, SyclDevice) +PACKET_TYPE(, float, 1, 4, SyclDevice) +PACKET_TYPE(const, float, 1, 4, const SyclDevice) +PACKET_TYPE(, float, 1, 4, const SyclDevice) +PACKET_TYPE(const, double, 0, 2, SyclDevice) +PACKET_TYPE(, double, 0, 2, SyclDevice) +PACKET_TYPE(const, double, 0, 2, const SyclDevice) +PACKET_TYPE(, double, 0, 2, const SyclDevice) +#undef PACKET_TYPE + +template<> struct PacketType: PacketType{}; +template<> struct PacketType: PacketType{}; +#endif +#endif // Tuple mimics std::pair but works on 
e.g. nvcc. template struct Tuple { @@ -121,14 +222,6 @@ template struct Tuple { EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tuple(const U& f, const V& s) : first(f), second(s) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple& operator= (const Tuple& rhs) { - if (&rhs == this) return *this; - first = rhs.first; - second = rhs.second; - return *this; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Tuple& rhs) { using numext::swap; @@ -168,12 +261,12 @@ template struct IndexPair { #ifdef EIGEN_HAS_SFINAE namespace internal { - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx, numeric_list) { return { idx[Is]... }; } - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType&, numeric_list) { return array(); diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d34f1e32..b3f00f77 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -31,12 +31,13 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorReshapingOp& type; + typedef const TensorReshapingOpEIGEN_DEVICE_REF type; }; template @@ -53,6 +54,7 @@ template class TensorReshapingOp : public TensorBase, WriteAccessors> { public: + typedef TensorBase, WriteAccessors> Base; typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename internal::remove_const::type CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; @@ -69,24 +71,7 @@ class TensorReshapingOp : public TensorBase::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC - 
EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp) protected: typename XprType::Nested m_xpr; @@ -101,15 +86,63 @@ struct TensorEvaluator, Device> typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory::type, Device> ConstCastStorage; + + static const int NumOutputDims = internal::array_size::value; + static const int NumInputDims = internal::array_size::Dimensions>::value; + + enum ReshapingKind { + // We do not use layout information to determine reshaping kind. + // Depending on the layout `N` can be inner or outer dimension. + OneByN = 0, // expr.reshape(1, N) + NByOne = 1, // expr.reshape(N, 1) + Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). + }; + + // clang-format off + static const ReshapingKind kind = +#if defined(EIGEN_HAS_INDEX_LIST) + (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/0, /*value=*/1)) ? OneByN + : (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/1, /*value=*/1)) ? 
NByOne + : Runtime; +#else + Runtime; +#endif + // clang-format on + enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + // For trivial reshapes with raw access to underlying data we will provide + // zero overhead block access. + // TODO(ezhulenev): Consider adding block access without raw access? + BlockAccess = TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef + typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { // The total size of the reshaped tensor must be equal to the total size @@ -117,17 +150,20 @@ struct TensorEvaluator, Device> eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType data, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +#endif + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -146,10 +182,53 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + // required in block(OutputTensorBlock* output_block) const + // For C++03 compatibility this must be defined outside the method + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_impl.data() != NULL); + eigen_assert((kind == Runtime) || + (kind == OneByN && desc.dimensions()[0] == 1) || + (kind == NByOne && desc.dimensions()[1] == 1)); + + if (kind == OneByN || kind == NByOne) { + // We can guarantee at compile time that block is just a contiguous slice + // of the underlying expression memory buffer. + return TensorBlock(internal::TensorBlockKind::kView, + m_impl.data() + desc.offset(), desc.dimensions()); + } else { + // This will do additional runtime checks, and in the end it might be also + // a view, or it might be a block materialized in the temporary buffer. 
+ return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, + scratch); + } + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } + #endif protected: TensorEvaluator m_impl; NewDimensions m_dimensions; @@ -167,14 +246,16 @@ template typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -183,15 +264,38 @@ template typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(index); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { this->m_impl.template writePacket(index, x); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(this->m_impl.data() != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment< + Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index> + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides(this->dimensions()), + this->m_impl.data(), desc.offset()), + block.expr()); + } }; @@ -214,12 +318,13 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorSlicingOp& type; + typedef const TensorSlicingOpEIGEN_DEVICE_REF type; }; template @@ -236,6 +341,7 @@ template class TensorSlicingOp : public TensorBase > { public: + typedef TensorBase > Base; typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; @@ -254,25 +360,7 @@ class TensorSlicingOp : public TensorBase::type& expression() const { return m_xpr; } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp) protected: typename XprType::Nested m_xpr; @@ -283,9 +371,12 @@ class TensorSlicingOp : public TensorBase struct MemcpyTriggerForSlicing { +template struct MemcpyTriggerForSlicing { 
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; } + EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { + const bool prefer_block_evaluation = BlockAccess && total > 32*1024; + return !prefer_block_evaluation && contiguous > threshold_; + } private: Index threshold_; @@ -294,11 +385,21 @@ template struct MemcpyTriggerForSlicing { // It is very expensive to start the memcpy kernel on GPU: we therefore only // use it for large copies. #ifdef EIGEN_USE_GPU -template struct MemcpyTriggerForSlicing { +template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } }; #endif + +// It is very expensive to start the memcpy kernel on GPU: we therefore only +// use it for large copies. +#ifdef EIGEN_USE_SYCL +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } +}; +#endif + } // Eval as rvalue @@ -308,23 +409,56 @@ struct TensorEvaluator, Devi typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + typedef StorageMemory Storage; + typedef StorageMemory::type, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; + enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. 
- IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + // FIXME: Temporary workaround for bug in slicing of bool tensors. + !internal::is_same::type, bool>::value, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. + typedef typename TensorEvaluator::TensorBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { - for (std::size_t i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= + op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || + op.startIndices()[i] != 0) { + m_is_identity = false; + } } + // No strides for scalars. 
+ if (NumDims == 0) return; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); if (static_cast(Layout) == static_cast(ColMajor)) { @@ -337,7 +471,7 @@ struct TensorEvaluator, Devi m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } else { m_inputStrides[NumDims-1] = 1; @@ -349,23 +483,17 @@ struct TensorEvaluator, Devi m_outputStrides[NumDims-1] = 1; for (int i = NumDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); - if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { + if (!NumTraits::type>::RequireInitialization + && data && m_impl.data()) { Index contiguous_values = 1; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { @@ -383,12 +511,12 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. 
- const MemcpyTriggerForSlicing trigger(m_device); - if (trigger(contiguous_values)) { - Scalar* src = (Scalar*)m_impl.data(); - for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + const MemcpyTriggerForSlicing trigger(m_device); + if (trigger(internal::array_prod(dimensions()), contiguous_values)) { + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); + for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); - m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar)); } return false; } @@ -396,25 +524,42 @@ struct TensorEvaluator, Devi return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + if (m_is_identity) { + return m_impl.template packet(index); + } + Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; 
--i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -426,6 +571,7 @@ struct TensorEvaluator, Devi inputIndices[0] += (indices[0] + m_offsets[0]); inputIndices[1] += (indices[1] + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -445,6 +591,7 @@ struct TensorEvaluator, Devi EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { values[i] = coeff(index+i); } @@ -454,12 +601,28 @@ struct TensorEvaluator, Devi } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 
1 : NumDims); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlock block = m_impl.block(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { - Scalar* result = m_impl.data(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); if (result) { Index offset = 0; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -493,12 +656,19 @@ struct TensorEvaluator, Devi } return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -506,6 +676,7 @@ struct TensorEvaluator, Devi } inputIndex += (index + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -520,8 +691,9 @@ struct TensorEvaluator, Devi array, NumDims> 
m_fastOutputStrides; array m_inputStrides; TensorEvaluator m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; + bool m_is_identity; const StartIndices m_offsets; }; @@ -535,36 +707,55 @@ struct TensorEvaluator, Device> typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; - enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Sizes Dimensions; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return this->m_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const 
PacketReturnType& x) { - const int packetSize = internal::unpacket_traits::size; + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + + const int packetSize = PacketType::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -576,6 +767,7 @@ struct TensorEvaluator, Device> inputIndices[0] += (indices[0] + this->m_offsets[0]); inputIndices[1] += (indices[1] + this->m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -595,14 +787,20 @@ struct TensorEvaluator, Device> internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { this->coeffRef(index+i) = values[i]; } } } -}; - + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlock(arg_desc, block); + } +}; namespace internal { template @@ -616,12 +814,13 @@ struct traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorStridingSlicingOp& type; + typedef const TensorStridingSlicingOpEIGEN_DEVICE_REF type; }; template @@ -637,6 +836,7 @@ template > { public: + typedef TensorBase > Base; typedef typename internal::traits::Scalar Scalar; typedef typename 
XprType::CoeffReturnType CoeffReturnType; typedef typename internal::nested::type Nested; @@ -660,26 +860,7 @@ class TensorStridingSlicingOp : public TensorBase::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run( - assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run( - assign, DefaultDevice()); - return *this; - } + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp) protected: typename XprType::Nested m_xpr; @@ -694,6 +875,13 @@ struct TensorEvaluator XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef Strides Dimensions; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -701,43 +889,58 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + 
m_device(device), + m_strides(op.strides()) { // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero - DSizes startIndicesClamped, stopIndicesClamped; - for (size_t i = 0; i < internal::array_size::value; ++i) { + DSizes startIndicesClamped, stopIndicesClamped; + for (ptrdiff_t i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); - if(m_strides[i]>0){ - startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); - stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - }else{ - /* implies m_strides[i]<0 by assert */ - startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); - stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + if (m_strides[i] > 0) { + startIndicesClamped[i] = + clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = + clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + } else { + /* implies m_strides[i] < 0 by assert */ + startIndicesClamped[i] = + clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = + clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); } m_startIndices[i] = startIndicesClamped[i]; } - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + typedef typename TensorEvaluator::Dimensions InputDimensions; + const InputDimensions& input_dims = m_impl.dimensions(); - // check for degenerate intervals and compute output tensor shape - bool degenerate = false;; - for(int i = 0; i < NumDims; i++){ + // compute output tensor shape + m_is_identity = true; + for (int i = 0; i < NumDims; i++) { Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; - if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ + if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { m_dimensions[i] = 0; - degenerate = true; - }else{ - m_dimensions[i] = 
interval / m_strides[i] - + (interval % m_strides[i] != 0 ? 1 : 0); + } else { + m_dimensions[i] = + (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); eigen_assert(m_dimensions[i] >= 0); } + if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) { + m_is_identity = false; + } } + Strides output_dims = m_dimensions; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -754,8 +957,7 @@ struct TensorEvaluator(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } else { m_inputStrides[NumDims-1] = m_strides[NumDims-1]; @@ -770,58 +972,58 @@ struct TensorEvaluator= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); } } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Strides Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 
1 : NumDims); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; } - +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; index -= idx * m_outputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; @@ -831,20 +1033,24 @@ struct TensorEvaluator m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; + bool m_is_identity; TensorEvaluator m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; DSizes m_startIndices; // clamped startIndices DSizes m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; - std::size_t m_block_total_size_max; }; // Eval as lvalue @@ -860,25 +1066,33 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } typedef typename XprType::Index Index; typedef typename 
XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Strides Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return this->m_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 647bcf10..ee44382c 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -31,6 +31,7 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -90,18 +91,33 @@ struct TensorEvaluator, Device typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = true, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = true, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()), 
m_paddingValue(op.padding_value()) + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector @@ -135,11 +151,20 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -148,6 +173,7 @@ struct TensorEvaluator, Device eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (isPaddingAtIndexForDim(idx, i)) { @@ -161,6 +187,7 @@ struct TensorEvaluator, Device } inputIndex += (index - m_padding[0].first); } else { + 
EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; if (isPaddingAtIndexForDim(idx, i)) { @@ -189,18 +216,298 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { TensorOpCost cost = m_impl.costPerCoeff(vectorized); if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0); } else { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1); } return cost; } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // If one of the dimensions is zero, return empty block view. + if (desc.size() == 0) { + return TensorBlock(internal::TensorBlockKind::kView, NULL, + desc.dimensions()); + } + + static const bool IsColMajor = Layout == static_cast(ColMajor); + const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1; + + Index offset = desc.offset(); + + // Compute offsets in the output tensor corresponding to the desc.offset(). + DSizes output_offsets; + for (int i = NumDims - 1; i > 0; --i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + const int stride_dim = IsColMajor ? 
dim : dim + 1; + output_offsets[dim] = offset / m_outputStrides[stride_dim]; + offset -= output_offsets[dim] * m_outputStrides[stride_dim]; + } + output_offsets[inner_dim_idx] = offset; + + // Offsets in the input corresponding to output offsets. + DSizes input_offsets = output_offsets; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offsets[dim] = input_offsets[dim] - m_padding[dim].first; + } + + // Compute offset in the input buffer (at this point it might be illegal and + // point outside of the input buffer, because we don't check for negative + // offsets, it will be autocorrected in the block iteration loop below). + Index input_offset = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offset += input_offsets[dim] * m_inputStrides[dim]; + } + + // Destination buffer and scratch buffer both indexed from 0 and have the + // same dimensions as the requested block (for destination buffer this + // property is guaranteed by `desc.destination()`). + Index output_offset = 0; + const DSizes output_strides = + internal::strides(desc.dimensions()); + + // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1` + // dimensions, skipping innermost dimension. In theory it should be possible + // to squeeze matching innermost dimensions, however in practice that did + // not show any improvements in benchmarks. Also in practice first outer + // dimension usually has padding, and will prevent squeezing. + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims - 1; ++i) { + const int dim = IsColMajor ? 
i + 1 : NumDims - i - 2; + it[i].count = 0; + it[i].size = desc.dimension(dim); + + it[i].input_stride = m_inputStrides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + it[i].output_stride = output_strides[dim]; + it[i].output_span = it[i].output_stride * (it[i].size - 1); + } + + const Index input_inner_dim_size = + static_cast(m_impl.dimensions()[inner_dim_idx]); + + // Total output size. + const Index output_size = desc.size(); + + // We will fill inner dimension of this size in the output. It might be + // larger than the inner dimension in the input, so we might have to pad + // before/after we copy values from the input inner dimension. + const Index output_inner_dim_size = desc.dimension(inner_dim_idx); + + // How many values to fill with padding BEFORE reading from the input inner + // dimension. + const Index output_inner_pad_before_size = + input_offsets[inner_dim_idx] < 0 + ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), + output_inner_dim_size) + : 0; + + // How many values we can actually copy from the input inner dimension. + const Index output_inner_copy_size = numext::mini( + // Want to copy from input. + (output_inner_dim_size - output_inner_pad_before_size), + // Can copy from input. + numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + + output_inner_pad_before_size), + Index(0))); + + eigen_assert(output_inner_copy_size >= 0); + + // How many values to fill with padding AFTER reading from the input inner + // dimension. + const Index output_inner_pad_after_size = + (output_inner_dim_size - output_inner_copy_size - + output_inner_pad_before_size); + + // Sanity check, sum of all sizes must be equal to the output size. + eigen_assert(output_inner_dim_size == + (output_inner_pad_before_size + output_inner_copy_size + + output_inner_pad_after_size)); + + // Keep track of current coordinates and padding in the output. 
+ DSizes output_coord = output_offsets; + DSizes output_padded; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + + typedef internal::StridedLinearBufferCopy LinCopy; + + // Prepare storage for the materialized padding result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a + // single logical inner dimension. + + // When possible we squeeze writes for the innermost (only if non-padded) + // dimension with the first padded dimension. This allows to reduce the + // number of calls to LinCopy and better utilize vector instructions. + const bool squeeze_writes = + NumDims > 1 && + // inner dimension is not padded + (input_inner_dim_size == m_dimensions[inner_dim_idx]) && + // and equal to the block inner dimension + (input_inner_dim_size == output_inner_dim_size); + + const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1; + + // Maximum coordinate on a squeeze dimension that we can write to. + const Index squeeze_max_coord = + squeeze_writes ? numext::mini( + // max non-padded element in the input + static_cast(m_dimensions[squeeze_dim] - + m_padding[squeeze_dim].second), + // max element in the output buffer + static_cast(output_offsets[squeeze_dim] + + desc.dimension(squeeze_dim))) + : static_cast(0); + + // Iterate copying data from `m_impl.data()` to the output buffer. + for (Index size = 0; size < output_size;) { + // Detect if we are in the padded region (exclude innermost dimension). + bool is_padded = false; + for (int j = 1; j < NumDims; ++j) { + const int dim = IsColMajor ? j : NumDims - j - 1; + is_padded = output_padded[dim]; + if (is_padded) break; + } + + if (is_padded) { + // Fill single innermost dimension with padding value. 
+ size += output_inner_dim_size; + + LinCopy::template Run( + typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_dim_size); + + + } else if (squeeze_writes) { + // Squeeze multiple reads from innermost dimensions. + const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim]; + size += output_inner_dim_size * squeeze_num; + + // Copy `squeeze_num` inner dimensions from input to output. + LinCopy::template Run( + typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(input_offset, 1, m_impl.data()), + output_inner_dim_size * squeeze_num); + + // Update iteration state for only `squeeze_num - 1` processed inner + // dimensions, because we have another iteration state update at the end + // of the loop that will update iteration state for the last inner + // processed dimension. + it[0].count += (squeeze_num - 1); + input_offset += it[0].input_stride * (squeeze_num - 1); + output_offset += it[0].output_stride * (squeeze_num - 1); + output_coord[squeeze_dim] += (squeeze_num - 1); + + } else { + // Single read from innermost dimension. + size += output_inner_dim_size; + + { // Fill with padding before copying from input inner dimension. + const Index out = output_offset; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_before_size); + } + + { // Copy data from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size; + const Index in = input_offset + output_inner_pad_before_size; + + eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL); + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(in, 1, m_impl.data()), + output_inner_copy_size); + } + + { // Fill with padding after copying from input inner dimension. 
+ const Index out = output_offset + output_inner_pad_before_size + + output_inner_copy_size; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_after_size); + } + } + + for (int j = 0; j < NumDims - 1; ++j) { + const int dim = IsColMajor ? j + 1 : NumDims - j - 2; + + if (++it[j].count < it[j].size) { + input_offset += it[j].input_stride; + output_offset += it[j].output_stride; + output_coord[dim] += 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + break; + } + it[j].count = 0; + input_offset -= it[j].input_span; + output_offset -= it[j].output_span; + output_coord[dim] -= it[j].size - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), + size(0), + input_stride(0), + input_span(0), + output_stride(0), + output_span(0) {} + + Index count; + Index size; + Index input_stride; + Index input_span; + Index output_stride; + Index output_span; + }; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( Index index, int dim_index) const { #if defined(EIGEN_HAS_INDEX_LIST) @@ -262,22 +569,23 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { - const Index first = index; - const Index last = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; const Index 
firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -289,21 +597,21 @@ struct TensorEvaluator, Device } } - const Index last = index + PacketSize - 1; - const Index first = index; + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; const Index lastPaddedLeft = m_padding[0].first; const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); const Index lastPaddedRight = m_outputStrides[1]; - if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. 
return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[0].first); return m_impl.template packet(inputIndex); @@ -319,23 +627,23 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; - + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { - const Index first = index; - const Index last = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. 
const Index idx = index / m_outputStrides[i+1]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -347,21 +655,21 @@ struct TensorEvaluator, Device } } - const Index last = index + PacketSize - 1; - const Index first = index; + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; const Index lastPaddedLeft = m_padding[NumDims-1].first; const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); const Index lastPaddedRight = m_outputStrides[NumDims-1]; - if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. 
inputIndex += (index - m_padding[NumDims-1].first); return m_impl.template packet(inputIndex); @@ -373,6 +681,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -387,6 +696,8 @@ struct TensorEvaluator, Device PaddingDimensions m_padding; Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 886a254f..413d25dd 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -87,18 +88,26 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented 
TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { Index num_patches = 1; @@ -143,12 +152,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -161,6 +170,7 @@ struct TensorEvaluator, Device> Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -169,6 +179,7 @@ struct TensorEvaluator, Device> inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -196,6 +207,7 @@ struct TensorEvaluator, Device> Index inputIndices[2] = {0, 0}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -211,6 +223,7 @@ struct TensorEvaluator, Device> inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -237,6 +250,7 @@ struct 
TensorEvaluator, Device> EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } @@ -253,7 +267,14 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: Dimensions m_dimensions; @@ -262,6 +283,7 @@ struct TensorEvaluator, Device> array m_patchStrides; TensorEvaluator m_impl; + }; } // end namespace Eigen diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 1655a813..37c1d1c3 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2018 Mehdi Goli Codeplay Software Ltd. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -16,50 +17,23 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) // We don't support 3d kernels since we currently only use 1 and // 2d kernels. - assert(threadIdx.z == 0); - return clock64() + - blockIdx.x * blockDim.x + threadIdx.x + - gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); - -#elif defined _WIN32 - // Use the current time as a baseline. 
- SYSTEMTIME st; - GetSystemTime(&st); - int time = st.wSecond + 1000 * st.wMilliseconds; - // Mix in a random number to make sure that we get different seeds if - // we try to generate seeds faster than the clock resolution. - // We need 2 random values since the generator only generate 16 bits at - // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; - return rnd; - -#elif defined __APPLE__ - // Same approach as for win32, except that the random number generator - // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). - uint64_t rnd = ::random() ^ mach_absolute_time(); - return rnd; - + gpu_assert(threadIdx.z == 0); + return blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); #else - // Augment the current time with pseudo random number generation - // to ensure that we get different seeds if we try to generate seeds - // faster than the clock resolution. - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - uint64_t rnd = ::random() ^ ts.tv_nsec; - return rnd; + // Rely on Eigen's random implementation. + return random(); #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { // TODO: Unify with the implementation in the non blocking thread pool. 
uint64_t current = *state; // Update the internal state - *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + *state = current * 6364136223846793005ULL + (stream << 1 | 1); // Generate the random output (using the PCG-XSH-RS scheme) return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); } @@ -73,34 +47,42 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state) { - unsigned rnd = PCG_XSH_RS_generator(state); +T RandomToTypeUniform(uint64_t* state, uint64_t stream) { + unsigned rnd = PCG_XSH_RS_generator(state, stream); return static_cast(rnd); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform(uint64_t* state) { - Eigen::half result; - // Generate 10 random bits for the mantissa - unsigned rnd = PCG_XSH_RS_generator(state); - result.x = static_cast(rnd & 0x3ffu); - // Set the exponent - result.x |= (static_cast(15) << 10); +Eigen::half RandomToTypeUniform(uint64_t* state, uint64_t stream) { + // Generate 10 random bits for the mantissa, merge with exponent. + unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x3ffu) | (static_cast(15) << 10); + Eigen::half result = Eigen::numext::bit_cast(half_bits); // Return the final result return result - Eigen::half(1.0f); } +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Eigen::bfloat16 RandomToTypeUniform(uint64_t* state, uint64_t stream) { + + // Generate 7 random bits for the mantissa, merge with exponent. 
+ unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x7fu) | (static_cast(127) << 7); + Eigen::bfloat16 result = Eigen::numext::bit_cast(half_bits); + // Return the final result + return result - Eigen::bfloat16(1.0f); +} template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform(uint64_t* state) { +float RandomToTypeUniform(uint64_t* state, uint64_t stream) { typedef union { uint32_t raw; float fp; } internal; internal result; // Generate 23 random bits for the mantissa mantissa - const unsigned rnd = PCG_XSH_RS_generator(state); + const unsigned rnd = PCG_XSH_RS_generator(state, stream); result.raw = rnd & 0x7fffffu; // Set the exponent result.raw |= (static_cast(127) << 23); @@ -109,7 +91,7 @@ float RandomToTypeUniform(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform(uint64_t* state) { +double RandomToTypeUniform(uint64_t* state, uint64_t stream) { typedef union { uint64_t raw; double dp; @@ -118,9 +100,9 @@ double RandomToTypeUniform(uint64_t* state) { result.raw = 0; // Generate 52 random bits for the mantissa // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; + unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; // The generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state); + unsigned rnd2 = PCG_XSH_RS_generator(state, stream); result.raw = (static_cast(rnd1) << 32) | rnd2; // Set the exponent result.raw |= (static_cast(1023) << 52); @@ -129,14 +111,14 @@ double RandomToTypeUniform(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); +std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), + RandomToTypeUniform(state, stream)); } template <> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); +std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), + RandomToTypeUniform(state, stream)); } template class UniformRandomGenerator { @@ -147,17 +129,42 @@ template class UniformRandomGenerator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // Therefor, we need two step to initializate the m_state. + // IN SYCL, the constructor of the functor is s called on the CPU + // and we get the clock seed here from the CPU. However, This seed is + //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. + // and only available on the Operator() function (which is called on the GPU). + // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread + // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds + // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction + // similar to CUDA Therefore, the thread Id injection is not available at this stage. + //However when the operator() is called the thread ID will be avilable. So inside the opeator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + //to the seed and construct the unique m_state per thead similar to cuda. 
+ m_exec_once =false; + #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( const UniformRandomGenerator& other) { m_state = other.m_state; + #ifdef EIGEN_USE_SYCL + m_exec_once =other.m_exec_once; + #endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeUniform(&local_state); - m_state = local_state; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + T result = RandomToTypeUniform(&m_state, i); return result; } @@ -165,16 +172,25 @@ template class UniformRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + EIGEN_UNROLL_LOOP for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform(&local_state); + values[j] = RandomToTypeUniform(&m_state, i); } - m_state = local_state; return internal::pload(values); } private: mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif }; template @@ -190,14 +206,14 @@ struct functor_traits > { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state) { +T RandomToTypeNormal(uint64_t* state, uint64_t stream) { // Use the ratio of uniform method to generate numbers following a normal // distribution. See for example Numerical Recipes chapter 7.3.9 for the // details. 
T u, v, q; do { - u = RandomToTypeUniform(state); - v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); + u = RandomToTypeUniform(state, stream); + v = T(1.7156) * (RandomToTypeUniform(state, stream) - T(0.5)); const T x = u - T(0.449871); const T y = numext::abs(v) + T(0.386595); q = x*x + y * (T(0.196)*y - T(0.25472)*x); @@ -208,14 +224,14 @@ T RandomToTypeNormal(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); +std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), + RandomToTypeNormal(state, stream)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); +std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), + RandomToTypeNormal(state, stream)); } @@ -226,17 +242,38 @@ template class NormalRandomGenerator { // Uses the given "seed" if non-zero, otherwise uses a random seed. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // Therefor, we need two steps to initializate the m_state. + // IN SYCL, the constructor of the functor is s called on the CPU + // and we get the clock seed here from the CPU. However, This seed is + //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. + // and only available on the Operator() function (which is called on the GPU). + // Therefore, the thread Id injection is not available at this stage. However when the operator() + //is called the thread ID will be avilable. 
So inside the opeator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + //to the seed and construct the unique m_state per thead similar to cuda. + m_exec_once =false; + #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( const NormalRandomGenerator& other) { m_state = other.m_state; +#ifdef EIGEN_USE_SYCL + m_exec_once=other.m_exec_once; +#endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeNormal(&local_state); - m_state = local_state; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + T result = RandomToTypeNormal(&m_state, i); return result; } @@ -244,16 +281,25 @@ template class NormalRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + EIGEN_UNROLL_LOOP for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal(&local_state); + values[j] = RandomToTypeNormal(&m_state, i); } - m_state = local_state; return internal::pload(values); } private: mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif }; diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 41d0d002..583f4625 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -11,8 +11,20 @@ #ifndef 
EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, +// so we'll use a macro to make clang happy. +#ifndef KERNEL_FRIEND +#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) +#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 +#else +#define KERNEL_FRIEND friend +#endif +#endif + + namespace Eigen { + /** \class TensorReduction * \ingroup CXX11_Tensor_Module * @@ -32,6 +44,7 @@ namespace internal { typedef typename XprType::Nested Nested; static const int NumDimensions = XprTraits::NumDimensions - array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; template struct MakePointer { // Intermediate typedef to workaround MSVC issue. @@ -152,7 +165,9 @@ struct GenericDimReducer<-1, Self, Op> { } }; -template +template struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { typename Self::CoeffReturnType accum = reducer.initialize(); @@ -164,23 +179,100 @@ struct InnerMostDimReducer { }; template -struct InnerMostDimReducer { +struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - const int packetSize = internal::unpacket_traits::size; + const typename Self::Index packetSize = internal::unpacket_traits::size; const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType p = reducer.template initializePacket(); + typename Self::PacketReturnType paccum = reducer.template initializePacket(); for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { - 
reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &paccum); } typename Self::CoeffReturnType accum = reducer.initialize(); for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); } - return reducer.finalizeBoth(accum, p); + return reducer.finalizeBoth(accum, paccum); } }; -template +#if !defined(EIGEN_HIPCC) +static const int kLeafSize = 1024; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType + reduce(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > kLeafSize) { + const typename Self::Index half = numValuesToReduce / 2; + reducer.reduce(reduce(self, firstIndex, half, reducer), &accum); + reducer.reduce( + reduce(self, firstIndex + half, numValuesToReduce - half, reducer), + &accum); + } else { + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType + reduce(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer) { + const typename Self::Index packetSize = + internal::unpacket_traits::size; + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > packetSize * kLeafSize) { + // Make sure the split point is aligned on a packet boundary. 
+ const typename Self::Index split = + packetSize * + divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)), + packetSize); + const typename Self::Index num_left = + numext::mini(split - firstIndex, numValuesToReduce); + reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); + if (num_left < numValuesToReduce) { + reducer.reduce( + reduce(self, split, numValuesToReduce - num_left, reducer), &accum); + } + return reducer.finalize(accum); + } else { + const typename Self::Index UnrollSize = + (numValuesToReduce / (2*packetSize)) * 2*packetSize; + const typename Self::Index VectorizedSize = + (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType paccum = + reducer.template initializePacket(); + typename Self::PacketReturnType paccum2 = + reducer.template initializePacket(); + for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) { + reducer.reducePacket( + self.m_impl.template packet(firstIndex + j), &paccum); + reducer.reducePacket( + self.m_impl.template packet(firstIndex + j + packetSize), + &paccum2); + } + for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) { + reducer.reducePacket(self.m_impl.template packet( + firstIndex + j), &paccum); + } + reducer.reducePacket(paccum2, &paccum); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; + ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, paccum); + } + } +}; +#endif + +template struct InnerMostDimPreserver { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { eigen_assert(false && "should never be called"); @@ -215,11 +307,11 @@ struct InnerMostDimPreserver<-1, Self, Op, true> { }; // Default full reducer -template +template struct FullReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& 
reducer, const Device&, typename Self::CoeffReturnType* output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); } @@ -229,7 +321,7 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template + bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, @@ -242,8 +334,8 @@ struct FullReducerShard { // Multithreaded full reducer template struct FullReducer { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; + static const Index PacketSize = unpacket_traits::size; // launch one reducer per thread and accumulate the result. 
@@ -320,29 +412,58 @@ struct OuterReducer { } }; +#ifdef EIGEN_USE_SYCL +// Default Generic reducer +template +struct GenericReducer { + static const bool HasOptimizedImplementation = false; -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; +#endif + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 -template -__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); -template -__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); -template -__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); +#if defined(EIGEN_HAS_GPU_FP16) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif -template -__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); -template -__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif +/** + * For SYCL, the return type of the 
reduction is deduced from the initialize method of the given Op. + * This allows the reduction to have a different type for the accumulator than the input data type. + * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input + * with the accumulator and the other for reducing two accumulators. + * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for + * some properties of the input. + */ +template +struct ReductionReturnType { +#if defined(EIGEN_USE_SYCL) + typedef typename remove_const().initialize())>::type type; +#else + typedef typename remove_const::type type; +#endif +}; + } // end namespace internal @@ -376,11 +497,15 @@ class TensorReductionOp : public TensorBase +struct TensorReductionEvaluatorBase; // Eval as rvalue template class MakePointer_, typename Device> -struct TensorEvaluator, Device> +struct TensorReductionEvaluatorBase, Device> { + typedef internal::reducer_traits ReducerTraits; + typedef Dims ReducedDims; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -390,26 +515,42 @@ struct TensorEvaluator, static const int NumOutputDims = NumInputDims - NumReducedDims; typedef typename internal::conditional, DSizes >::type Dimensions; typedef typename XprType::Scalar Scalar; - typedef TensorEvaluator, Device> Self; + typedef TensorReductionEvaluatorBase, Device> Self; static const bool InputPacketAccess = TensorEvaluator::PacketAccess; - typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::ReductionReturnType::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const Index PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + 
+ // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + static const int NumPreservedStrides = max_n_1::size; enum { IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -434,11 +575,13 @@ struct TensorEvaluator, m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides.back() = 1; + m_outputStrides[NumOutputDims - 1] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = 
m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -466,6 +609,7 @@ struct TensorEvaluator, ++reduceIndex; } else { m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; ++outputIndex; } } @@ -475,13 +619,19 @@ struct TensorEvaluator, if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } + + m_numValuesToReduce = + NumOutputDims == 0 + ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { - m_impl.evalSubExprsIfNeeded(NULL); - + EIGEN_STRONG_INLINE + bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) { // Use the FullReducer if possible. 
if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && @@ -489,7 +639,7 @@ struct TensorEvaluator, !RunningOnGPU))) { bool need_assign = false; if (!data) { - m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); + m_result = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); data = m_result; need_assign = true; } @@ -497,20 +647,9 @@ struct TensorEvaluator, internal::FullReducer::run(*this, reducer, m_device, data); return need_assign; } - else if(RunningOnSycl){ - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return (m_result != NULL); - } // Attempt to use an optimized reduction. 
- else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { + else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { @@ -524,8 +663,8 @@ struct TensorEvaluator, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); m_result = data; } else { @@ -533,9 +672,10 @@ struct TensorEvaluator, } } Op reducer(m_reducer); + // For SYCL this if always return false if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } return true; @@ -557,8 +697,8 @@ struct TensorEvaluator, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * 
num_coeffs_to_preserve))); m_result = data; } else { @@ -566,9 +706,10 @@ struct TensorEvaluator, } } Op reducer(m_reducer); + // For SYCL this if always return false if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } return true; @@ -576,21 +717,54 @@ struct TensorEvaluator, return (m_result != NULL); } } + #if defined(EIGEN_USE_SYCL) + // If there is no Optimised version for SYCL, the reduction expression + // must break into two subexpression and use the SYCL generic Reducer on the device. + if(RunningOnSycl) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + Op reducer(m_reducer); + internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return (m_result != NULL); + } + #endif } return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE + void + evalSubExprsIfNeededAsync(EvaluatorPointerType data, + EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { + done(evalSubExprsIfNeededCommon(data)); + }); + } +#endif + + EIGEN_STRONG_INLINE + bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + return evalSubExprsIfNeededCommon(data); + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if ((RunningOnSycl || RunningFullReduction || 
RunningOnGPU) && m_result) { + if (( RunningFullReduction || RunningOnGPU) && m_result ) { return *(m_result + index); } Op reducer(m_reducer); @@ -662,37 +836,52 @@ struct TensorEvaluator, } } - EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel - const Dims& xprDims() const {return m_xpr_dims;} - + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_result.bind(cgh); + } +#endif private: template friend struct internal::GenericDimReducer; - template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimReducer; template friend struct internal::InnerMostDimPreserver; template friend struct internal::FullReducer; #ifdef EIGEN_USE_THREADS template friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 - template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + 
template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); +#if defined(EIGEN_HAS_GPU_FP16) + template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); + template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); + template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif - template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); - template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif +#if defined(EIGEN_USE_SYCL) + template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer; + // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer + template friend struct internal::GenericReducer; +#endif + + template friend struct internal::InnerReducer; + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -741,10 +930,12 @@ struct TensorEvaluator, Dimensions m_dimensions; // Precomputed strides for the output tensor. array m_outputStrides; - // Subset of strides of the input tensor for the non-reduced dimensions. - // Indexed by output dimensions. 
- static const int NumPreservedStrides = max_n_1::size; + array, NumOutputDims> m_fastOutputStrides; array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. @@ -760,7 +951,7 @@ struct TensorEvaluator, Op m_reducer; // For full reductions -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) static const bool RunningOnGPU = internal::is_same::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) @@ -770,10 +961,36 @@ static const bool RunningOnGPU = false; static const bool RunningOnGPU = false; static const bool RunningOnSycl = false; #endif - typename MakePointer_::Type m_result; + EvaluatorPointerType m_result; - const Device& m_device; - const Dims& m_xpr_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +template class MakePointer_, typename Device> +struct TensorEvaluator, Device> +: public TensorReductionEvaluatorBase, Device> { + typedef TensorReductionEvaluatorBase, Device> Base; + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} +}; + + +template class MakePointer_> +struct TensorEvaluator, Eigen::SyclDevice> +: public TensorReductionEvaluatorBase, Eigen::SyclDevice> { + + typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base; + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} + // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel + //Therefore the coeff function should be overridden by for SYCL kernel + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { + return *(this->data() + 
index); + } + // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel + //Therefore the packet function should be overridden by for SYCL kernel + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const { + return internal::pload(this->data() + index); + } }; } // end namespace Eigen diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 65638b6a..68780cd3 100644 --- a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -1,750 +1,6 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H - -namespace Eigen { -namespace internal { - - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -// Full reducers for GPU, don't vectorize for now - -// Reducer function that enables multiple cuda thread to safely accumulate at the same -// output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another cuda thread -// updated the content of the output address it will try again. 
-template -__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if __CUDA_ARCH__ >= 300 - if (sizeof(T) == 4) - { - unsigned int oldval = *reinterpret_cast(output); - unsigned int newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned int readback; - while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else if (sizeof(T) == 8) { - unsigned long long oldval = *reinterpret_cast(output); - unsigned long long newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned long long readback; - while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else { - assert(0 && "Wordsize not supported"); - } -#else - assert(0 && "Shouldn't be called on unsupported device"); -#endif -} - -// We extend atomicExch to support extra data types -template -__device__ inline Type atomicExchCustom(Type* address, Type val) { - return atomicExch(address, val); -} - -template <> -__device__ inline double atomicExchCustom(double* address, double val) { - unsigned long long int* address_as_ull = reinterpret_cast(address); - return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); -} - -#ifdef EIGEN_HAS_CUDA_FP16 -template