|
2 | 2 | #include "filter.hpp"
|
3 | 3 | #include <stdlib.h>
|
4 | 4 | #include <unistd.h>
|
| 5 | +#include <thrust/extrema.h> |
| 6 | + |
| 7 | + |
| 8 | +#define DEBUG |
5 | 9 |
|
6 | 10 | // TODO: move this into common
|
7 | 11 | __device__ int ceilDivGPU(int a, int b) {
|
@@ -38,7 +42,7 @@ __global__ void ransacKernel(GPU_Cloud pc, float* inlierCounts, int* modelPoints
|
38 | 42 | __shared__ float inlierField[MAX_THREADS];
|
39 | 43 | inlierField[threadIdx.x] = 0;
|
40 | 44 |
|
41 |
| - int iteration = blockIdx.x; //which "iteration" |
| 45 | + int iteration = blockIdx.x; //which "iteration" of RANSAC |
42 | 46 | float inliers = 0; //number of inliers in this thread
|
43 | 47 |
|
44 | 48 | // select 3 random points from the cloud as the model that this particular block will evaluate
|
@@ -101,78 +105,36 @@ __global__ void ransacKernel(GPU_Cloud pc, float* inlierCounts, int* modelPoints
|
101 | 105 | }
|
102 | 106 | }
|
103 | 107 |
|
104 |
| -//to avoid kernel launch time, this could actually be appended to the bottom of the ransacKernel, |
105 |
| -//after a syncthreads() call. But for now it will be left separate for the purpose of clarity. |
106 |
| -//kernel launch time is likely in tens of microseconds. TODO test to confirm this theory |
107 |
| -/* |
108 |
| -LAUNCH: |
109 |
| - - [Block] 1 |
110 |
| - - [Thread] Number of attempted models ("iterations") |
111 |
| -
|
112 |
| -REQUIRES: |
113 |
| - - Buffer with inlier counts for each attempted model in RANSAC |
114 |
| - - Output in memory the 3 points of the selected model |
/**
 * \brief Writes the three points of the chosen RANSAC model into a plane and computes its normal.
 *
 * Launch layout: a single block. Thread t (t = 0, 1, 2) copies model point t; threads with
 * threadIdx.x >= 3 skip the copy, so a larger block cannot index past the model's three points.
 *
 * \param pc          Cloud whose data array holds the candidate points (read as float4)
 * \param selection   Plane that receives the three model points and then has its normal computed
 * \param idx         Index of the chosen model; its point indices are modelPoints[3*idx .. 3*idx + 2]
 * \param modelPoints Flat array of point indices, three consecutive entries per model
 * \param maxCount    Pointer to the inlier count of the chosen model (only dereferenced for the DEBUG print)
 */
__global__ void getOptimalModelPoints(GPU_Cloud pc, Plane &selection, int idx, int* modelPoints, float* maxCount) {
    int pt = threadIdx.x;

    // A model consists of exactly three points; any extra thread must not read or write
    if (pt < 3) {
        float4 point = pc.data[modelPoints[3*idx + pt]];
        selection[pt] = make_float3(point.x, point.y, point.z);
    }

    // Barrier sits outside the branch so every thread of the block reaches it
    __syncthreads();

    // Use one thread to compute the normal once all three points are written
    if (pt == 0) {
        selection.ComputeNormal();

        #ifdef DEBUG
        printf("Winner model inlier count: %f \n", *maxCount);
        #endif
    }
}
|
175 | 126 |
|
/**
 * \brief Selects the model with the greatest inlier count and writes its plane into selection.
 *
 * Finds the maximum of inlierCounts[0 .. iterations) with thrust, stores the resulting model
 * index in optimalModelIndex, and launches getOptimalModelPoints to fill *selection.
 *
 * optimalModelIndex receives -1 when the best inlier count is not greater than 1.0, which is
 * the convention of the selectOptimalRansacModel kernel this function replaces. The plane is
 * still filled from the best model in that case.
 */
void RansacPlane::selectOptimalModel() {
    float* maxCount = thrust::max_element(thrust::device, inlierCounts, inlierCounts + iterations);
    // Pointer arithmetic gives us the model index with most inliers
    int maxIdx = maxCount - inlierCounts;

    // Bring the winning count to the host so we can decide whether the model is usable
    float maxInliers = 0.0f;
    checkStatus(cudaMemcpy(&maxInliers, maxCount, sizeof(float), cudaMemcpyDeviceToHost));

    // Send the index to GPU, or -1 if no model gathered more than one inlier
    int reportedIdx = (maxInliers > 1.0f) ? maxIdx : -1;
    checkStatus(cudaMemcpy(optimalModelIndex, &reportedIdx, sizeof(int), cudaMemcpyHostToDevice));

    // Now launch a kernel to write the Plane of this model into selection.
    // The real index is passed here (never -1) so the kernel always reads valid model points.
    getOptimalModelPoints<<<1, 3>>>(pc, *selection, maxIdx, modelPoints, maxCount);
    checkStatus(cudaGetLastError());        // launch-configuration errors
    checkStatus(cudaDeviceSynchronize());   // errors raised while the kernel ran
}
| 137 | + |
176 | 138 | RansacPlane::RansacPlane(float3 axis, float epsilon, int iterations, float threshold, int pcSize, float removalRadius)
|
177 | 139 | : pc(pc), axis(axis), epsilon(epsilon), iterations(iterations), threshold(threshold), removalRadius(removalRadius) {
|
178 | 140 |
|
@@ -221,7 +183,8 @@ Plane RansacPlane::computeModel(GPU_Cloud &pc) {
|
221 | 183 | checkStatus(cudaDeviceSynchronize());
|
222 | 184 |
|
223 | 185 | // Choose the model with the greatest inlier count
|
224 |
| - selectOptimalRansacModel<<<1, MAX_THREADS>>>(pc, inlierCounts, modelPoints, *selection, iterations, optimalModelIndex); |
| 186 | + selectOptimalModel(); |
| 187 | + |
225 | 188 | checkStatus(cudaGetLastError());
|
226 | 189 | checkStatus(cudaDeviceSynchronize());
|
227 | 190 |
|
|
0 commit comments