77
88#define DEBUG
99
10- // TODO: move this into common
10+ // does ceiling division
1111__device__ int ceilDivGPU (int a, int b) {
1212 return (a + b - 1 ) / b;
1313}
1414
15+ /* *
16+ * Launch with iteration [aka, number of candidate models] many blocks and MAX_THREADS
17+ * \brief Runs through RANSAC 'iterations' and computes the number of inliers for each iteration's candidate
18+ * \param pc Point cloud to process
19+ * \param candidatePlanePoints Array len(iterations) of randomly generated candidate planes. Each elt is an int3 containing
20+ * the INDICIES of three points from the point cloud data array which form a candidate plane
21+ * \param threshold How far a point may be from the candidate plane to be considered an inlier
22+ * \param axis The axis normal to the plane we are looking for
23+ * \param epsilon How far off the given axis a candidate plane can be without being rejected
24+ * \param inlierCountOut An output array len(iterations) where the number of inliers for each candidate tried is stored
25+ */
26+
1527/*
16- LAUNCH:
17- - [Block] # iterations [aka, randomly selected models] to try
18- - [Thread] MAX_THREADS
19-
20- REQUIRES:
21- - GPU point cloud
22- - A buffer to write inlier counts for each attempted model
23- - A buffer that tells the kernel what the randomly selected points were for each model
24- - Threshold distance for a pt to be considered an inlier
25- EFFECTS:
26-
27- [Block]:
2828 Each block represents an "iteration" of the traditional RANSAC algorithm.
29- That is, every block has a different set of randomly chosen 3 points that define the
30- model (a plane is minimally defined by 3 points). The threads in the block then check
31- every point in the point cloud against the model for that block.
32-
33- [Thread]:
34- Threads are used to decide how many points are inliers to the model. If
35- thread max = 1024 and there are 2048 points, each thread will process 2
36- points. Each thread will write the number of inliers in the set of points it evaluated
37- to its specific spot in shared memory. Threads are synced, and then threads will
38- participate in a parallel reduction to give the total number of inliers for that block/model,
39- which will be returned from the kernel in the inlierCounts buffer.
29+ The threads in the block then check every point in the point cloud against
30+ the candidate for that iteration. Each thread is responsible for
31+ total_points/num_threads many points. The thread computes how many of its
32+ own points are inliers. It then participates in parallel reduction to give
33+ the total number of inliers for that iteration in the output buffer
4034*/
41- __global__ void ransacKernel (GPU_Cloud pc, float * inlierCounts, int * modelPoints, float threshold, float3 axis, float epsilon) {
42- __shared__ float inlierField[MAX_THREADS];
43- inlierField[threadIdx .x ] = 0 ;
4435
45- int iteration = blockIdx .x ; // which "iteration" of RANSAC
46- float inliers = 0 ; // number of inliers in this thread
36+ __global__ void ransacKernel (GPU_Cloud pc, int3 * candidatePlanePoints, float threshold, float3 axis, float epsilon, int * inlierCountsOut) {
37+ // Set up internal inlier counts for each thread of this block to write to
38+ __shared__ float inlierCountInternal[MAX_THREADS];
39+ inlierCountInternal[threadIdx .x ] = 0 ;
40+
41+ // Which "iteration" of RANSAC
42+ int iteration = blockIdx .x ;
43+ int3 candidateModelIndicies = candidatePlanePoints[iteration];
4744
48- // select 3 random points from the cloud as the model that this particular block will evaluate
49- int randIdx0 = modelPoints[iteration*3 + 0 ];
50- int randIdx1 = modelPoints[iteration*3 + 1 ];
51- int randIdx2 = modelPoints[iteration*3 + 2 ];
45+ // Number of inliers in this thread
46+ float inliers = 0 ;
5247
53- if (randIdx0 >= pc.size || randIdx1 >= pc.size || randIdx2 >= pc.size ) {
54- inlierCounts[iteration] = 0 ;
48+ // Verify that this candidate plane is actually valid and doesn't consist of a point that goes out of bounds
49+ if (candidateModelIndicies.x >= pc.size || candidateModelIndicies.y >= pc.size || candidateModelIndicies.z >= pc.size ) {
50+ // Mark as an invalid model so its not chosen
51+ inlierCountsOut[iteration] = -1 ;
5552 return ;
5653 }
5754
58- float3 modelPt0 = make_float3 (pc.data [randIdx0]. x , pc. data [randIdx0]. y , pc. data [randIdx0]. z );
59- float3 modelPt1 = make_float3 (pc.data [randIdx1]. x , pc. data [randIdx1]. y , pc. data [randIdx1]. z );
60- float3 modelPt2 = make_float3 (pc.data [randIdx2]. x , pc. data [randIdx2]. y , pc. data [randIdx2]. z );
55+ float3 modelPt0 = make_float3 (pc.data [candidateModelIndicies. x ] );
56+ float3 modelPt1 = make_float3 (pc.data [candidateModelIndicies. y ] );
57+ float3 modelPt2 = make_float3 (pc.data [candidateModelIndicies. z ] );
6158
62- // Create a plane from the 3 points
59+ // Create the candidate plane from the 3 points
6360 Plane plane (modelPt0, modelPt1, modelPt2);
6461
65- // check that n dot desired axis is less than epsilon, if so, return here
62+ // Verify that the candidate plane's normal is within epsilon of desired axis
6663 if (abs (dot (normalize (plane.normal ), normalize (axis))) < epsilon) {
67- if (threadIdx .x == 0 ) inlierCounts[iteration] = 0 ; // make it -1 to show invalid model?
64+ // Mark as an invalid model so its not chosen
65+ inlierCountsOut[iteration] = 0 ;
6866 return ;
6967 }
7068
7169 // Construct predicate to chek if a point is an inlier in the plane
7270 NotInPlane pred (plane.normal , modelPt1, threshold);
7371
74- // figure out how many points each thread must compute distance for and determine if each is inlier/outlier
72+ // Figure out how many points each thread must check
7573 int pointsPerThread = ceilDivGPU (pc.size , MAX_THREADS);
7674 for (int i = 0 ; i < pointsPerThread; i++) {
77- // select a point index or return if this isn't a valid point
75+ // Select a point index or return if this isn't a valid point
7876 int pointIdx = threadIdx .x * pointsPerThread + i;
79- if (pointIdx >= pc.size ) continue ; // TODO Should this be return???
77+ if (pointIdx >= pc.size ) continue ;
8078
81- // point in the point cloud that could be an inlier or outlier
82- float4 curPt = make_float4 ( pc.data [pointIdx]. x , pc. data [pointIdx]. y , pc. data [pointIdx]. z , 0 ) ;
79+ // Point in the point cloud that could be an inlier or outlier
80+ float4 curPt = pc.data [pointIdx];
8381
84- // add a 1 if inlier in plane, 0 if not
85- inliers += (pred (curPt)) ? 0 : 1 ; // very probalmatic line, how can we reduce these checks
82+ // Add a 1 if inlier in plane, 0 if not
83+ inliers += (pred (curPt)) ? 0 : 1 ;
8684 }
8785
88- // parallel reduction to get an aggregate sum of the number of inliers for this model
89- // this is all equivalent to sum(inlierField), but it does it in parallel
90- inlierField[threadIdx .x ] = inliers;
86+ // Load the inliers for this thread into the shared memory that all threads can access
87+ inlierCountInternal[threadIdx .x ] = inliers;
9188 __syncthreads ();
89+
90+ // Parallel reduction to get an aggregate sum of the number of inliers for this model
91+ // This is all equivalent to sum(inlierCountInternal), but it does it in parallel
9292 int aliveThreads = (blockDim .x ) / 2 ;
9393 while (aliveThreads > 0 ) {
9494 if (threadIdx .x < aliveThreads) {
95- inliers += inlierField [aliveThreads + threadIdx .x ];
96- if (threadIdx .x >= (aliveThreads) / 2 ) inlierField [threadIdx .x ] = inliers;
95+ inliers += inlierCountInternal [aliveThreads + threadIdx .x ];
96+ if (threadIdx .x >= (aliveThreads) / 2 ) inlierCountInternal [threadIdx .x ] = inliers;
9797 }
9898 __syncthreads ();
9999 aliveThreads /= 2 ;
100100 }
101101
102- // at the final thread, write to global memory
102+ // At the final thread, write the number of inliers for this iteration's model to global memory
103103 if (threadIdx .x == 0 ) {
104- inlierCounts [iteration] = inliers;
104+ inlierCountsOut [iteration] = inliers;
105105 }
106106}
107107
108108 /* *
109109 * \brief Updates the plane selection from the cloud using the given model index
110110 */
111- __global__ void getOptimalModelPoints (GPU_Cloud pc, Plane &selection, int idx, int * modelPoints, float * maxCount) {
112- int pt = threadIdx .x ;
113- float4 point = pc.data [modelPoints[3 *idx + pt]];
114- selection[pt] = make_float3 (point.x , point.y , point.z );
111+ __global__ void getOptimalModelPoints (GPU_Cloud pc, Plane &selection, int optimalCandidateIndex, int3 * candidatePlanePoints, int * maxInliersCount) {
112+ // Each thread is responsible for one point in the output candidate plane, there will be 3 threads
113+ int candidatePtNum = threadIdx .x ;
114+
115+ // Copy the point from the cloud into the output plane object
116+ float4 point;
117+ if (candidatePtNum == 0 ) point = pc.data [candidatePlanePoints[optimalCandidateIndex].x ];
118+ if (candidatePtNum == 1 ) point = pc.data [candidatePlanePoints[optimalCandidateIndex].y ];
119+ if (candidatePtNum == 2 ) point = pc.data [candidatePlanePoints[optimalCandidateIndex].z ];
120+
121+ selection[candidatePtNum] = make_float3 (point);
115122
116123 // Use one thread to compute the normal
117124 __syncthreads ();
118125 if (threadIdx .x == 0 ) {
119126 selection.ComputeNormal ();
120127
121128 #ifdef DEBUG
122- printf (" Winner model inlier count: %f \n " , *maxCount );
129+ printf (" Winner model inlier count: %d \n " , *maxInliersCount );
123130 #endif
124131 }
125132}
126133
127134void RansacPlane::selectOptimalModel () {
128- float * maxCount = thrust::max_element (thrust::device, inlierCounts, inlierCounts + iterations);
129- // Pointer arithmetic gives us the model index with most inliers
130- int maxIdx = maxCount - inlierCounts;
131- // Send the index to GPU
132- cudaMemcpy (optimalModelIndex, &maxIdx , sizeof (int ), cudaMemcpyHostToDevice);
133- // Now launch a kernel to write the Plane of this model into selection
134- getOptimalModelPoints<<<1 , 3 >>> (pc, *selection, maxIdx, modelPoints, maxCount);
135+ // Get a pointer to the elt in inlierCounts that is the greatest
136+ int * maxCount = thrust::max_element (thrust::device, inlierCounts, inlierCounts + iterations);
137+ // Pointer arithmetic gives us the optimal model index with most inliers
138+ int optIdx = maxCount - inlierCounts;
139+ // Now launch a kernel to construct the Plane object 'selection'
140+ getOptimalModelPoints<<<1 , 3 >>> (pc, *selection, optIdx, candidatePlanePoints, maxCount);
135141 checkStatus (cudaDeviceSynchronize ());
136142}
137143
138144RansacPlane::RansacPlane (float3 axis, float epsilon, int iterations, float threshold, int pcSize, float removalRadius)
139145 : pc(pc), axis(axis), epsilon(epsilon), iterations(iterations), threshold(threshold), removalRadius(removalRadius) {
140146
141147 // Set up buffers needed for RANSAC
142- cudaMalloc (&inlierCounts, sizeof (float ) * iterations);
143- cudaMalloc (&modelPoints , sizeof (int ) * iterations * 3 );
148+ cudaMalloc (&inlierCounts, sizeof (int ) * iterations);
149+ cudaMalloc (&candidatePlanePoints , sizeof (int3 ) * iterations);
144150 cudaMalloc (&selection, sizeof (Plane));
145- cudaMalloc (&optimalModelIndex, sizeof (int ));
146151
147152 // Generate random numbers in CPU to use in RANSAC kernel
148- int * randomNumsCPU = (int *) malloc (sizeof (int ) * iterations* 3 );
149-
153+ int3 * randomNumsCPU = (int3 *) malloc (sizeof (int3 ) * iterations);
150154 for (int i = 0 ; i < iterations; i++) {
151155 int a = 0 ;
152156 int b = 0 ;
153157 int c = 0 ;
158+ // Prevent duplicate points in a particular model
154159 while (a == b || b == c || a == c) {
155160 a = rand () % pcSize;
156161 b = rand () % pcSize;
157162 c = rand () % pcSize;
158163 }
159-
160- randomNumsCPU[i*3 ] = a;
161- randomNumsCPU[i*3 + 1 ] = b;
162- randomNumsCPU[i*3 + 2 ] = c;
164+ randomNumsCPU[i].x = a;
165+ randomNumsCPU[i].y = b;
166+ randomNumsCPU[i].z = c;
163167 }
164-
165- cudaMemcpy (modelPoints, randomNumsCPU, sizeof (int ) * iterations * 3 , cudaMemcpyHostToDevice);
168+ cudaMemcpy (candidatePlanePoints, randomNumsCPU, sizeof (int3 ) * iterations, cudaMemcpyHostToDevice);
166169 free (randomNumsCPU);
167170
168- // Generate a buffer for retreiving the selected model from CUDA Kernels
169- selectedModel = (Plane*) malloc (sizeof (Plane));
171+ // Generate a CPU buffer for retreiving the selected model from CUDA Kernels
172+ selectionCPU = (Plane*) malloc (sizeof (Plane));
170173}
171174
172175Plane RansacPlane::computeModel (GPU_Cloud &pc) {
173- if (pc.size == 0 ) return {make_float3 (0 ,0 ,0 ), make_float3 (0 ,0 ,0 ), make_float3 (0 ,0 ,0 )};
176+ if (pc.size == 0 ) {
177+ std::cout << " [WARNING] Can't run RANSAC on empty plane." << std::endl;
178+ return {make_float3 (0 ,0 ,0 ), make_float3 (0 ,0 ,0 ), make_float3 (0 ,0 ,0 )};
179+ }
174180
175181 // Copy vars locally
176182 this ->pc = pc;
177183 int blocks = iterations;
178184 int threads = MAX_THREADS;
179185
180186 // Get a list of models and corresponding inlier count
181- ransacKernel<<<blocks, threads>>> (pc, inlierCounts, modelPoints, threshold, axis, cos (epsilon*3.1415 /180 ));
187+ ransacKernel<<<blocks, threads>>> (pc, candidatePlanePoints, threshold, axis, cos (epsilon*3.1415 /180 ), inlierCounts );
182188 checkStatus (cudaGetLastError ());
183189 checkStatus (cudaDeviceSynchronize ());
184190
185191 // Choose the model with the greatest inlier count
186192 selectOptimalModel ();
187193
188- checkStatus (cudaGetLastError ());
189- checkStatus (cudaDeviceSynchronize ());
190-
191194 // Copy selected plane to CPU
192- cudaMemcpy (selectedModel , selection, sizeof (Plane), cudaMemcpyDeviceToHost);
195+ cudaMemcpy (selectionCPU , selection, sizeof (Plane), cudaMemcpyDeviceToHost);
193196
194197 // Filter out all the points in the plane
195- NotInPlane predicate (selectedModel ->normal , selectedModel ->p1 , threshold);
198+ NotInPlane predicate (selectionCPU ->normal , selectionCPU ->p1 , threshold);
196199 Filter<NotInPlane>(pc, predicate, FilterOp::REMOVE, 0 );
197200 checkStatus (cudaGetLastError ());
198201 checkStatus (cudaDeviceSynchronize ());
199202
200- return *selectedModel ;
203+ return *selectionCPU ;
201204}
202205
203206RansacPlane::~RansacPlane () {
204207 cudaFree (inlierCounts);
205- cudaFree (modelPoints );
208+ cudaFree (candidatePlanePoints );
206209 cudaFree (selection);
207- cudaFree (optimalModelIndex);
208- free (selectedModel);
210+ free (selectionCPU);
209211}
0 commit comments