|
| 1 | +// This file is part of OpenCV project. |
| 2 | +// It is subject to the license terms in the LICENSE file found in the top-level directory |
| 3 | +// of this distribution and at http://opencv.org/license.html. |
| 4 | + |
| 5 | +#if !defined CUDA_DISABLER |
| 6 | + |
| 7 | +#include <opencv2/core/cuda/common.hpp> |
| 8 | +#include <opencv2/cudev/util/atomic.hpp> |
| 9 | +#include "moments.cuh" |
| 10 | + |
| 11 | +namespace cv { namespace cuda { namespace device { namespace imgproc { |
| 12 | + |
| 13 | +constexpr int blockSizeX = 32; |
| 14 | +constexpr int blockSizeY = 16; |
| 15 | + |
| 16 | +template <typename T> |
| 17 | +__device__ T butterflyWarpReduction(T value) { |
| 18 | + for (int i = 16; i >= 1; i /= 2) |
| 19 | + value += __shfl_xor_sync(0xffffffff, value, i, 32); |
| 20 | + return value; |
| 21 | +} |
| 22 | + |
| 23 | +template <typename T> |
| 24 | +__device__ T butterflyHalfWarpReduction(T value) { |
| 25 | + for (int i = 8; i >= 1; i /= 2) |
| 26 | + value += __shfl_xor_sync(0xffff, value, i, 32); |
| 27 | + return value; |
| 28 | +} |
| 29 | + |
| 30 | +template<typename T, int nMoments> |
| 31 | +__device__ void updateSums(const T val, const unsigned int x, T r[4]) { |
| 32 | + const T x2 = x * x; |
| 33 | + const T x3 = static_cast<T>(x) * x2; |
| 34 | + r[0] += val; |
| 35 | + r[1] += val * x; |
| 36 | + if (nMoments >= n12) r[2] += val * x2; |
| 37 | + if (nMoments >= n123) r[3] += val * x3; |
| 38 | +} |
| 39 | + |
| 40 | +template<typename TSrc, typename TMoments, int nMoments> |
| 41 | +__device__ void rowReductions(const PtrStepSz<TSrc> img, const bool binary, const unsigned int y, TMoments r[4], TMoments smem[][nMoments + 1]) { |
| 42 | + for (int x = threadIdx.x; x < img.cols; x += blockDim.x) { |
| 43 | + const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1; |
| 44 | + updateSums<TMoments,nMoments>(val, x, r); |
| 45 | + } |
| 46 | +} |
| 47 | + |
| 48 | +template<typename TSrc, typename TMoments, bool fourByteAligned, int nMoments> |
| 49 | +__device__ void rowReductionsCoalesced(const PtrStepSz<TSrc> img, const bool binary, const unsigned int y, TMoments r[4], const int offsetX, TMoments smem[][nMoments + 1]) { |
| 50 | + const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX; |
| 51 | + // load uncoalesced head |
| 52 | + if (!fourByteAligned && threadIdx.x == 0) { |
| 53 | + for (int x = 0; x < ::min(alignedOffset, static_cast<int>(img.cols)); x++) { |
| 54 | + const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1; |
| 55 | + updateSums<TMoments, nMoments>(val, x, r); |
| 56 | + } |
| 57 | + } |
| 58 | + |
| 59 | + // coalesced loads |
| 60 | + const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? img.ptr(y) : img.ptr(y) + alignedOffset); |
| 61 | + const int cols4 = fourByteAligned ? img.cols / 4 : (img.cols - alignedOffset) / 4; |
| 62 | + for (int x = threadIdx.x; x < cols4; x += blockDim.x) { |
| 63 | + const unsigned int data = rowPtrIntAligned[x]; |
| 64 | +#pragma unroll 4 |
| 65 | + for (int i = 0; i < 4; i++) { |
| 66 | + const int iX = alignedOffset + 4 * x + i; |
| 67 | + const uchar ucharVal = ((data >> i * 8) & 0xFFU); |
| 68 | + const TMoments val = (!binary || ucharVal == 0) ? ucharVal : 1; |
| 69 | + updateSums<TMoments, nMoments>(val, iX, r); |
| 70 | + } |
| 71 | + } |
| 72 | + |
| 73 | + // load uncoalesced tail |
| 74 | + if (threadIdx.x == 0) { |
| 75 | + const int iTailStart = fourByteAligned ? cols4 * 4 : cols4 * 4 + alignedOffset; |
| 76 | + for (int x = iTailStart; x < img.cols; x++) { |
| 77 | + const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1; |
| 78 | + updateSums<TMoments, nMoments>(val, x, r); |
| 79 | + } |
| 80 | + } |
| 81 | +} |
| 82 | + |
| 83 | +template <typename TSrc, typename TMoments, bool coalesced = false, bool fourByteAligned = false, int nMoments> |
| 84 | +__global__ void spatialMoments(const PtrStepSz<TSrc> img, const bool binary, TMoments* moments, const int offsetX = 0) { |
| 85 | + const unsigned int y = blockIdx.x * blockDim.y + threadIdx.y; |
| 86 | + __shared__ TMoments smem[blockSizeY][nMoments + 1]; |
| 87 | + if (threadIdx.y < nMoments && threadIdx.x < blockSizeY) |
| 88 | + smem[threadIdx.x][threadIdx.y] = 0; |
| 89 | + __syncthreads(); |
| 90 | + |
| 91 | + TMoments r[4] = { 0 }; |
| 92 | + if (y < img.rows) { |
| 93 | + if (coalesced) |
| 94 | + rowReductionsCoalesced<TSrc, TMoments, fourByteAligned, nMoments>(img, binary, y, r, offsetX, smem); |
| 95 | + else |
| 96 | + rowReductions<TSrc, TMoments, nMoments>(img, binary, y, r, smem); |
| 97 | + } |
| 98 | + |
| 99 | + const unsigned long y2 = y * y; |
| 100 | + const TMoments y3 = static_cast<TMoments>(y2) * y; |
| 101 | + const TMoments res = butterflyWarpReduction<float>(r[0]); |
| 102 | + if (res) { |
| 103 | + smem[threadIdx.y][0] = res; //0th |
| 104 | + smem[threadIdx.y][1] = butterflyWarpReduction(r[1]); //1st |
| 105 | + smem[threadIdx.y][2] = y * res; //1st |
| 106 | + if (nMoments >= n12) { |
| 107 | + smem[threadIdx.y][3] = butterflyWarpReduction(r[2]); //2nd |
| 108 | + smem[threadIdx.y][4] = smem[threadIdx.y][1] * y; //2nd |
| 109 | + smem[threadIdx.y][5] = y2 * res; //2nd |
| 110 | + } |
| 111 | + if (nMoments >= n123) { |
| 112 | + smem[threadIdx.y][6] = butterflyWarpReduction(r[3]); //3rd |
| 113 | + smem[threadIdx.y][7] = smem[threadIdx.y][3] * y; //3rd |
| 114 | + smem[threadIdx.y][8] = smem[threadIdx.y][1] * y2; //3rd |
| 115 | + smem[threadIdx.y][9] = y3 * res; //3rd |
| 116 | + } |
| 117 | + } |
| 118 | + __syncthreads(); |
| 119 | + |
| 120 | + if (threadIdx.x < blockSizeY && threadIdx.y < nMoments) |
| 121 | + smem[threadIdx.y][nMoments] = butterflyHalfWarpReduction(smem[threadIdx.x][threadIdx.y]); |
| 122 | + __syncthreads(); |
| 123 | + |
| 124 | + if (threadIdx.y == 0 && threadIdx.x < nMoments) { |
| 125 | + if (smem[threadIdx.x][nMoments]) |
| 126 | + cudev::atomicAdd(&moments[threadIdx.x], smem[threadIdx.x][nMoments]); |
| 127 | + } |
| 128 | +} |
| 129 | + |
| 130 | +template <typename TSrc, typename TMoments, int nMoments> struct momentsDispatcherNonChar { |
| 131 | + static void call(const PtrStepSz<TSrc> src, PtrStepSz<TMoments> moments, const bool binary, const int offsetX, const cudaStream_t stream) { |
| 132 | + dim3 blockSize(blockSizeX, blockSizeY); |
| 133 | + dim3 gridSize = dim3(divUp(src.rows, blockSizeY)); |
| 134 | + spatialMoments<TSrc, TMoments, false, false, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr()); |
| 135 | + if (stream == 0) |
| 136 | + cudaSafeCall(cudaStreamSynchronize(stream)); |
| 137 | + }; |
| 138 | +}; |
| 139 | + |
| 140 | +template <typename TSrc, int nMoments> struct momentsDispatcherChar { |
| 141 | + static void call(const PtrStepSz<TSrc> src, PtrStepSz<float> moments, const bool binary, const int offsetX, const cudaStream_t stream) { |
| 142 | + dim3 blockSize(blockSizeX, blockSizeY); |
| 143 | + dim3 gridSize = dim3(divUp(src.rows, blockSizeY)); |
| 144 | + if (offsetX) |
| 145 | + spatialMoments<TSrc, float, true, false, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr(), offsetX); |
| 146 | + else |
| 147 | + spatialMoments<TSrc, float, true, true, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr()); |
| 148 | + |
| 149 | + if (stream == 0) |
| 150 | + cudaSafeCall(cudaStreamSynchronize(stream)); |
| 151 | + }; |
| 152 | +}; |
| 153 | + |
| 154 | +template <typename TSrc, typename TMoments, int nMoments> struct momentsDispatcher : momentsDispatcherNonChar<TSrc, TMoments, nMoments> {}; |
| 155 | +template <int nMoments> struct momentsDispatcher<uchar, float, nMoments> : momentsDispatcherChar<uchar, nMoments> {}; |
| 156 | +template <int nMoments> struct momentsDispatcher<schar, float, nMoments> : momentsDispatcherChar<schar, nMoments> {}; |
| 157 | + |
| 158 | +template <typename TSrc, typename TMoments> |
| 159 | +void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream) { |
| 160 | + if (order == 1) |
| 161 | + momentsDispatcher<TSrc, TMoments, n1>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream); |
| 162 | + else if (order == 2) |
| 163 | + momentsDispatcher<TSrc, TMoments, n12>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream); |
| 164 | + else if (order == 3) |
| 165 | + momentsDispatcher<TSrc, TMoments, n123>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream); |
| 166 | +}; |
| 167 | + |
| 168 | +template void moments<uchar, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 169 | +template void moments<schar, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 170 | +template void moments<ushort, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 171 | +template void moments<short, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 172 | +template void moments<int, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 173 | +template void moments<float, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 174 | +template void moments<double, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 175 | + |
| 176 | +template void moments<uchar, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 177 | +template void moments<schar, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 178 | +template void moments<ushort, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 179 | +template void moments<short, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 180 | +template void moments<int, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 181 | +template void moments<float, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 182 | +template void moments<double, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); |
| 183 | + |
| 184 | +}}}} |
| 185 | + |
| 186 | +#endif /* CUDA_DISABLER */ |
0 commit comments