|
2 | 2 | #include "filter.hpp" |
3 | 3 | #include <stdlib.h> |
4 | 4 | #include <unistd.h> |
| 5 | +#include <thrust/extrema.h> |
| 6 | + |
| 7 | + |
| 8 | +#define DEBUG |
5 | 9 |
|
6 | 10 | // TODO: move this into common |
7 | 11 | __device__ int ceilDivGPU(int a, int b) { |
@@ -38,7 +42,7 @@ __global__ void ransacKernel(GPU_Cloud pc, float* inlierCounts, int* modelPoints |
38 | 42 | __shared__ float inlierField[MAX_THREADS]; |
39 | 43 | inlierField[threadIdx.x] = 0; |
40 | 44 |
|
41 | | - int iteration = blockIdx.x; //which "iteration" |
| 45 | + int iteration = blockIdx.x; //which "iteration" of RANSAC |
42 | 46 | float inliers = 0; //number of inliers in this thread |
43 | 47 |
|
44 | 48 | // select 3 random points from the cloud as the model that this particular block will evaluate |
@@ -101,78 +105,36 @@ __global__ void ransacKernel(GPU_Cloud pc, float* inlierCounts, int* modelPoints |
101 | 105 | } |
102 | 106 | } |
103 | 107 |
|
104 | | -//to avoid kernel launch time, this could actually be appended to the bottom of the ransacKernel, |
105 | | -//after a syncthreads() call. But for now it will be left separate for the purpose of clarity. |
106 | | -//kernel launch time is likely in tens of microseconds. TODO test to confirm this theory |
107 | | -/* |
108 | | -LAUNCH: |
109 | | - - [Block] 1 |
110 | | - - [Thread] Number of attempted models ("iterations") |
111 | | -
|
112 | | -REQUIRES: |
113 | | - - Buffer with inlier counts for each attempted model in RANSAC |
114 | | - - Output in memory the 3 points of the selected model |
| 108 | + /** |
| 109 | + * \brief Updates the plane selection from the cloud using the given model index |
| 110 | + */ |
| 111 | +__global__ void getOptimalModelPoints(GPU_Cloud pc, Plane &selection, int idx, int* modelPoints, float* maxCount) { |
| 112 | + int pt = threadIdx.x; |
| 113 | + float4 point = pc.data[modelPoints[3*idx + pt]]; |
| 114 | + selection[pt] = make_float3(point.x, point.y, point.z); |
115 | 115 |
|
116 | | -EFFECTS: |
117 | | - - Selects the optimal model (the one with the greatest inlier count) |
118 | | - - Outputs the points of this model |
119 | | -*/ |
120 | | -// optimalModel out = { p1.x, p1.y, p1.z, p2.x, p2.y, p2.z, p3.x, p3.y, p3.z} |
121 | | -__global__ void selectOptimalRansacModel(GPU_Cloud pc, float* inlierCounts, int* modelPoints, Plane& optimalModelOut, int iterations, int* optimalModelIndex) { |
122 | | - |
123 | | - __shared__ float inlierCountsLocal[MAX_THREADS]; |
124 | | - __shared__ int modelIndiciesLocal[MAX_THREADS]; |
125 | | - |
126 | | - //TODO: This can easily index out of bounds if threadIdx.x > numPoints in the PC |
127 | | - //another problem: we must initialize the inlierCountsLocal with low values that won't be chosen |
128 | | - |
129 | | - // Populate the locally defined arrays |
130 | | - float inliers = (threadIdx.x < iterations) ? inlierCounts[threadIdx.x] : 0; |
131 | | - int optimalModel = threadIdx.x; |
132 | | - inlierCountsLocal[threadIdx.x] = inliers; |
133 | | - modelIndiciesLocal[threadIdx.x] = optimalModel; |
| 116 | + // Use one thread to compute the normal |
134 | 117 | __syncthreads(); |
135 | | - |
136 | | - // Parallel reduction to determine the model with the largest number of inliers |
137 | | - int aliveThreads = (blockDim.x) / 2; |
138 | | - while (aliveThreads > 0) { |
139 | | - if (threadIdx.x < aliveThreads) { |
140 | | - int temp = max(inlierCountsLocal[aliveThreads + threadIdx.x], inliers); |
141 | | - if(temp > inliers) { |
142 | | - inliers = temp; |
143 | | - optimalModel = modelIndiciesLocal[aliveThreads + threadIdx.x]; |
144 | | - } |
145 | | - |
146 | | - if (threadIdx.x >= (aliveThreads) / 2) { |
147 | | - modelIndiciesLocal[threadIdx.x] = optimalModel; |
148 | | - inlierCountsLocal[threadIdx.x] = inliers; |
149 | | - } |
150 | | - } |
151 | | - __syncthreads(); |
152 | | - aliveThreads /= 2; |
153 | | - } |
154 | | - |
155 | | - //at the final thread, write to global memory |
156 | | - if(threadIdx.x < 3) { |
157 | | - float3 pt = make_float3(pc.data[ modelPoints[modelIndiciesLocal[0]*3 + threadIdx.x] ].x, pc.data[ modelPoints[modelIndiciesLocal[0]*3 + threadIdx.x] ].y, pc.data[ modelPoints[modelIndiciesLocal[0]*3 + threadIdx.x] ].z); |
158 | | - |
159 | | - // Set output model |
160 | | - optimalModelOut[threadIdx.x] = pt; |
161 | | - } |
162 | | - |
163 | | - __syncthreads(); |
164 | | - |
165 | 118 | if(threadIdx.x == 0) { |
166 | | - // Find normal to the plane |
167 | | - optimalModelOut.ComputeNormal(); |
| 119 | + selection.ComputeNormal(); |
168 | 120 |
|
169 | | - printf("winner model inlier count: %f \n", inlierCountsLocal[0]); |
170 | | - |
171 | | - //check here if the inlier counts local is 0, if so return -1 instead |
172 | | - *optimalModelIndex = (inlierCountsLocal[0] > 1.0) ? modelIndiciesLocal[0] : -1; |
| 121 | + #ifdef DEBUG |
| 122 | + printf("Winner model inlier count: %f \n", *maxCount); |
| 123 | + #endif |
173 | 124 | } |
174 | 125 | } |
175 | 126 |
|
| 127 | +void RansacPlane::selectOptimalModel() { |
| 128 | + float* maxCount = thrust::max_element(thrust::device, inlierCounts, inlierCounts + iterations); |
| 129 | + // Pointer arithmetic gives us the model index with most inliers |
| 130 | + int maxIdx = maxCount - inlierCounts; |
| 131 | + // Send the index to GPU |
| 132 | + cudaMemcpy(optimalModelIndex, &maxIdx , sizeof(int), cudaMemcpyHostToDevice); |
| 133 | + // Now launch a kernel to write the Plane of this model into selection |
| 134 | + getOptimalModelPoints<<<1, 3>>>(pc, *selection, maxIdx, modelPoints, maxCount); |
| 135 | + checkStatus(cudaDeviceSynchronize()); |
| 136 | +} |
| 137 | + |
176 | 138 | RansacPlane::RansacPlane(float3 axis, float epsilon, int iterations, float threshold, int pcSize, float removalRadius) |
177 | 139 | : pc(pc), axis(axis), epsilon(epsilon), iterations(iterations), threshold(threshold), removalRadius(removalRadius) { |
178 | 140 |
|
@@ -221,7 +183,8 @@ Plane RansacPlane::computeModel(GPU_Cloud &pc) { |
221 | 183 | checkStatus(cudaDeviceSynchronize()); |
222 | 184 |
|
223 | 185 | // Choose the model with the greatest inlier count |
224 | | - selectOptimalRansacModel<<<1, MAX_THREADS>>>(pc, inlierCounts, modelPoints, *selection, iterations, optimalModelIndex); |
| 186 | + selectOptimalModel(); |
| 187 | + |
225 | 188 | checkStatus(cudaGetLastError()); |
226 | 189 | checkStatus(cudaDeviceSynchronize()); |
227 | 190 |
|
|
0 commit comments