7
7
8
8
#define DEBUG
9
9
10
- // TODO: move this into common
10
+ // does ceiling division
11
11
__device__ int ceilDivGPU (int a, int b) {
12
12
return (a + b - 1 ) / b;
13
13
}
14
14
15
+ /* *
16
+ * Launch with iteration [aka, number of candidate models] many blocks and MAX_THREADS
17
+ * \brief Runs through RANSAC 'iterations' and computes the number of inliers for each iteration's candidate
18
+ * \param pc Point cloud to process
19
+ * \param candidatePlanePoints Array len(iterations) of randomly generated candidate planes. Each elt is an int3 containing
20
+ * the INDICIES of three points from the point cloud data array which form a candidate plane
21
+ * \param threshold How far a point may be from the candidate plane to be considered an inlier
22
+ * \param axis The axis normal to the plane we are looking for
23
+ * \param epsilon How far off the given axis a candidate plane can be without being rejected
24
+ * \param inlierCountOut An output array len(iterations) where the number of inliers for each candidate tried is stored
25
+ */
26
+
15
27
/*
16
- LAUNCH:
17
- - [Block] # iterations [aka, randomly selected models] to try
18
- - [Thread] MAX_THREADS
19
-
20
- REQUIRES:
21
- - GPU point cloud
22
- - A buffer to write inlier counts for each attempted model
23
- - A buffer that tells the kernel what the randomly selected points were for each model
24
- - Threshold distance for a pt to be considered an inlier
25
- EFFECTS:
26
-
27
- [Block]:
28
28
Each block represents an "iteration" of the traditional RANSAC algorithm.
29
- That is, every block has a different set of randomly chosen 3 points that define the
30
- model (a plane is minimally defined by 3 points). The threads in the block then check
31
- every point in the point cloud against the model for that block.
32
-
33
- [Thread]:
34
- Threads are used to decide how many points are inliers to the model. If
35
- thread max = 1024 and there are 2048 points, each thread will process 2
36
- points. Each thread will write the number of inliers in the set of points it evaluated
37
- to its specific spot in shared memory. Threads are synced, and then threads will
38
- participate in a parallel reduction to give the total number of inliers for that block/model,
39
- which will be returned from the kernel in the inlierCounts buffer.
29
+ The threads in the block then check every point in the point cloud against
30
+ the candidate for that iteration. Each thread is responsible for
31
+ total_points/num_threads many points. The thread computes how many of its
32
+ own points are inliers. It then participates in parallel reduction to give
33
+ the total number of inliers for that iteration in the output buffer
40
34
*/
41
- __global__ void ransacKernel (GPU_Cloud pc, float * inlierCounts, int * modelPoints, float threshold, float3 axis, float epsilon) {
42
- __shared__ float inlierField[MAX_THREADS];
43
- inlierField[threadIdx .x ] = 0 ;
44
35
45
- int iteration = blockIdx .x ; // which "iteration" of RANSAC
46
- float inliers = 0 ; // number of inliers in this thread
36
+ __global__ void ransacKernel (GPU_Cloud pc, int3 * candidatePlanePoints, float threshold, float3 axis, float epsilon, int * inlierCountsOut) {
37
+ // Set up internal inlier counts for each thread of this block to write to
38
+ __shared__ float inlierCountInternal[MAX_THREADS];
39
+ inlierCountInternal[threadIdx .x ] = 0 ;
40
+
41
+ // Which "iteration" of RANSAC
42
+ int iteration = blockIdx .x ;
43
+ int3 candidateModelIndicies = candidatePlanePoints[iteration];
47
44
48
- // select 3 random points from the cloud as the model that this particular block will evaluate
49
- int randIdx0 = modelPoints[iteration*3 + 0 ];
50
- int randIdx1 = modelPoints[iteration*3 + 1 ];
51
- int randIdx2 = modelPoints[iteration*3 + 2 ];
45
+ // Number of inliers in this thread
46
+ float inliers = 0 ;
52
47
53
- if (randIdx0 >= pc.size || randIdx1 >= pc.size || randIdx2 >= pc.size ) {
54
- inlierCounts[iteration] = 0 ;
48
+ // Verify that this candidate plane is actually valid and doesn't consist of a point that goes out of bounds
49
+ if (candidateModelIndicies.x >= pc.size || candidateModelIndicies.y >= pc.size || candidateModelIndicies.z >= pc.size ) {
50
+ // Mark as an invalid model so its not chosen
51
+ inlierCountsOut[iteration] = -1 ;
55
52
return ;
56
53
}
57
54
58
- float3 modelPt0 = make_float3 (pc.data [randIdx0]. x , pc. data [randIdx0]. y , pc. data [randIdx0]. z );
59
- float3 modelPt1 = make_float3 (pc.data [randIdx1]. x , pc. data [randIdx1]. y , pc. data [randIdx1]. z );
60
- float3 modelPt2 = make_float3 (pc.data [randIdx2]. x , pc. data [randIdx2]. y , pc. data [randIdx2]. z );
55
+ float3 modelPt0 = make_float3 (pc.data [candidateModelIndicies. x ] );
56
+ float3 modelPt1 = make_float3 (pc.data [candidateModelIndicies. y ] );
57
+ float3 modelPt2 = make_float3 (pc.data [candidateModelIndicies. z ] );
61
58
62
- // Create a plane from the 3 points
59
+ // Create the candidate plane from the 3 points
63
60
Plane plane (modelPt0, modelPt1, modelPt2);
64
61
65
- // check that n dot desired axis is less than epsilon, if so, return here
62
+ // Verify that the candidate plane's normal is within epsilon of desired axis
66
63
if (abs (dot (normalize (plane.normal ), normalize (axis))) < epsilon) {
67
- if (threadIdx .x == 0 ) inlierCounts[iteration] = 0 ; // make it -1 to show invalid model?
64
+ // Mark as an invalid model so its not chosen
65
+ inlierCountsOut[iteration] = 0 ;
68
66
return ;
69
67
}
70
68
71
69
// Construct predicate to chek if a point is an inlier in the plane
72
70
NotInPlane pred (plane.normal , modelPt1, threshold);
73
71
74
- // figure out how many points each thread must compute distance for and determine if each is inlier/outlier
72
+ // Figure out how many points each thread must check
75
73
int pointsPerThread = ceilDivGPU (pc.size , MAX_THREADS);
76
74
for (int i = 0 ; i < pointsPerThread; i++) {
77
- // select a point index or return if this isn't a valid point
75
+ // Select a point index or return if this isn't a valid point
78
76
int pointIdx = threadIdx .x * pointsPerThread + i;
79
- if (pointIdx >= pc.size ) continue ; // TODO Should this be return???
77
+ if (pointIdx >= pc.size ) continue ;
80
78
81
- // point in the point cloud that could be an inlier or outlier
82
- float4 curPt = make_float4 ( pc.data [pointIdx]. x , pc. data [pointIdx]. y , pc. data [pointIdx]. z , 0 ) ;
79
+ // Point in the point cloud that could be an inlier or outlier
80
+ float4 curPt = pc.data [pointIdx];
83
81
84
- // add a 1 if inlier in plane, 0 if not
85
- inliers += (pred (curPt)) ? 0 : 1 ; // very probalmatic line, how can we reduce these checks
82
+ // Add a 1 if inlier in plane, 0 if not
83
+ inliers += (pred (curPt)) ? 0 : 1 ;
86
84
}
87
85
88
- // parallel reduction to get an aggregate sum of the number of inliers for this model
89
- // this is all equivalent to sum(inlierField), but it does it in parallel
90
- inlierField[threadIdx .x ] = inliers;
86
+ // Load the inliers for this thread into the shared memory that all threads can access
87
+ inlierCountInternal[threadIdx .x ] = inliers;
91
88
__syncthreads ();
89
+
90
+ // Parallel reduction to get an aggregate sum of the number of inliers for this model
91
+ // This is all equivalent to sum(inlierCountInternal), but it does it in parallel
92
92
int aliveThreads = (blockDim .x ) / 2 ;
93
93
while (aliveThreads > 0 ) {
94
94
if (threadIdx .x < aliveThreads) {
95
- inliers += inlierField [aliveThreads + threadIdx .x ];
96
- if (threadIdx .x >= (aliveThreads) / 2 ) inlierField [threadIdx .x ] = inliers;
95
+ inliers += inlierCountInternal [aliveThreads + threadIdx .x ];
96
+ if (threadIdx .x >= (aliveThreads) / 2 ) inlierCountInternal [threadIdx .x ] = inliers;
97
97
}
98
98
__syncthreads ();
99
99
aliveThreads /= 2 ;
100
100
}
101
101
102
- // at the final thread, write to global memory
102
+ // At the final thread, write the number of inliers for this iteration's model to global memory
103
103
if (threadIdx .x == 0 ) {
104
- inlierCounts [iteration] = inliers;
104
+ inlierCountsOut [iteration] = inliers;
105
105
}
106
106
}
107
107
108
108
/* *
109
109
* \brief Updates the plane selection from the cloud using the given model index
110
110
*/
111
- __global__ void getOptimalModelPoints (GPU_Cloud pc, Plane &selection, int idx, int * modelPoints, float * maxCount) {
112
- int pt = threadIdx .x ;
113
- float4 point = pc.data [modelPoints[3 *idx + pt]];
114
- selection[pt] = make_float3 (point.x , point.y , point.z );
111
+ __global__ void getOptimalModelPoints (GPU_Cloud pc, Plane &selection, int optimalCandidateIndex, int3 * candidatePlanePoints, int * maxInliersCount) {
112
+ // Each thread is responsible for one point in the output candidate plane, there will be 3 threads
113
+ int candidatePtNum = threadIdx .x ;
114
+
115
+ // Copy the point from the cloud into the output plane object
116
+ float4 point;
117
+ if (candidatePtNum == 0 ) point = pc.data [candidatePlanePoints[optimalCandidateIndex].x ];
118
+ if (candidatePtNum == 1 ) point = pc.data [candidatePlanePoints[optimalCandidateIndex].y ];
119
+ if (candidatePtNum == 2 ) point = pc.data [candidatePlanePoints[optimalCandidateIndex].z ];
120
+
121
+ selection[candidatePtNum] = make_float3 (point);
115
122
116
123
// Use one thread to compute the normal
117
124
__syncthreads ();
118
125
if (threadIdx .x == 0 ) {
119
126
selection.ComputeNormal ();
120
127
121
128
#ifdef DEBUG
122
- printf (" Winner model inlier count: %f \n " , *maxCount );
129
+ printf (" Winner model inlier count: %d \n " , *maxInliersCount );
123
130
#endif
124
131
}
125
132
}
126
133
127
134
void RansacPlane::selectOptimalModel () {
128
- float * maxCount = thrust::max_element (thrust::device, inlierCounts, inlierCounts + iterations);
129
- // Pointer arithmetic gives us the model index with most inliers
130
- int maxIdx = maxCount - inlierCounts;
131
- // Send the index to GPU
132
- cudaMemcpy (optimalModelIndex, &maxIdx , sizeof (int ), cudaMemcpyHostToDevice);
133
- // Now launch a kernel to write the Plane of this model into selection
134
- getOptimalModelPoints<<<1 , 3 >>> (pc, *selection, maxIdx, modelPoints, maxCount);
135
+ // Get a pointer to the elt in inlierCounts that is the greatest
136
+ int * maxCount = thrust::max_element (thrust::device, inlierCounts, inlierCounts + iterations);
137
+ // Pointer arithmetic gives us the optimal model index with most inliers
138
+ int optIdx = maxCount - inlierCounts;
139
+ // Now launch a kernel to construct the Plane object 'selection'
140
+ getOptimalModelPoints<<<1 , 3 >>> (pc, *selection, optIdx, candidatePlanePoints, maxCount);
135
141
checkStatus (cudaDeviceSynchronize ());
136
142
}
137
143
138
144
RansacPlane::RansacPlane (float3 axis, float epsilon, int iterations, float threshold, int pcSize, float removalRadius)
139
145
: pc(pc), axis(axis), epsilon(epsilon), iterations(iterations), threshold(threshold), removalRadius(removalRadius) {
140
146
141
147
// Set up buffers needed for RANSAC
142
- cudaMalloc (&inlierCounts, sizeof (float ) * iterations);
143
- cudaMalloc (&modelPoints , sizeof (int ) * iterations * 3 );
148
+ cudaMalloc (&inlierCounts, sizeof (int ) * iterations);
149
+ cudaMalloc (&candidatePlanePoints , sizeof (int3 ) * iterations);
144
150
cudaMalloc (&selection, sizeof (Plane));
145
- cudaMalloc (&optimalModelIndex, sizeof (int ));
146
151
147
152
// Generate random numbers in CPU to use in RANSAC kernel
148
- int * randomNumsCPU = (int *) malloc (sizeof (int ) * iterations* 3 );
149
-
153
+ int3 * randomNumsCPU = (int3 *) malloc (sizeof (int3 ) * iterations);
150
154
for (int i = 0 ; i < iterations; i++) {
151
155
int a = 0 ;
152
156
int b = 0 ;
153
157
int c = 0 ;
158
+ // Prevent duplicate points in a particular model
154
159
while (a == b || b == c || a == c) {
155
160
a = rand () % pcSize;
156
161
b = rand () % pcSize;
157
162
c = rand () % pcSize;
158
163
}
159
-
160
- randomNumsCPU[i*3 ] = a;
161
- randomNumsCPU[i*3 + 1 ] = b;
162
- randomNumsCPU[i*3 + 2 ] = c;
164
+ randomNumsCPU[i].x = a;
165
+ randomNumsCPU[i].y = b;
166
+ randomNumsCPU[i].z = c;
163
167
}
164
-
165
- cudaMemcpy (modelPoints, randomNumsCPU, sizeof (int ) * iterations * 3 , cudaMemcpyHostToDevice);
168
+ cudaMemcpy (candidatePlanePoints, randomNumsCPU, sizeof (int3 ) * iterations, cudaMemcpyHostToDevice);
166
169
free (randomNumsCPU);
167
170
168
- // Generate a buffer for retreiving the selected model from CUDA Kernels
169
- selectedModel = (Plane*) malloc (sizeof (Plane));
171
+ // Generate a CPU buffer for retreiving the selected model from CUDA Kernels
172
+ selectionCPU = (Plane*) malloc (sizeof (Plane));
170
173
}
171
174
172
175
Plane RansacPlane::computeModel (GPU_Cloud &pc) {
173
- if (pc.size == 0 ) return {make_float3 (0 ,0 ,0 ), make_float3 (0 ,0 ,0 ), make_float3 (0 ,0 ,0 )};
176
+ if (pc.size == 0 ) {
177
+ std::cout << " [WARNING] Can't run RANSAC on empty plane." << std::endl;
178
+ return {make_float3 (0 ,0 ,0 ), make_float3 (0 ,0 ,0 ), make_float3 (0 ,0 ,0 )};
179
+ }
174
180
175
181
// Copy vars locally
176
182
this ->pc = pc;
177
183
int blocks = iterations;
178
184
int threads = MAX_THREADS;
179
185
180
186
// Get a list of models and corresponding inlier count
181
- ransacKernel<<<blocks, threads>>> (pc, inlierCounts, modelPoints, threshold, axis, cos (epsilon*3.1415 /180 ));
187
+ ransacKernel<<<blocks, threads>>> (pc, candidatePlanePoints, threshold, axis, cos (epsilon*3.1415 /180 ), inlierCounts );
182
188
checkStatus (cudaGetLastError ());
183
189
checkStatus (cudaDeviceSynchronize ());
184
190
185
191
// Choose the model with the greatest inlier count
186
192
selectOptimalModel ();
187
193
188
- checkStatus (cudaGetLastError ());
189
- checkStatus (cudaDeviceSynchronize ());
190
-
191
194
// Copy selected plane to CPU
192
- cudaMemcpy (selectedModel , selection, sizeof (Plane), cudaMemcpyDeviceToHost);
195
+ cudaMemcpy (selectionCPU , selection, sizeof (Plane), cudaMemcpyDeviceToHost);
193
196
194
197
// Filter out all the points in the plane
195
- NotInPlane predicate (selectedModel ->normal , selectedModel ->p1 , threshold);
198
+ NotInPlane predicate (selectionCPU ->normal , selectionCPU ->p1 , threshold);
196
199
Filter<NotInPlane>(pc, predicate, FilterOp::REMOVE, 0 );
197
200
checkStatus (cudaGetLastError ());
198
201
checkStatus (cudaDeviceSynchronize ());
199
202
200
- return *selectedModel ;
203
+ return *selectionCPU ;
201
204
}
202
205
203
206
RansacPlane::~RansacPlane () {
204
207
cudaFree (inlierCounts);
205
- cudaFree (modelPoints );
208
+ cudaFree (candidatePlanePoints );
206
209
cudaFree (selection);
207
- cudaFree (optimalModelIndex);
208
- free (selectedModel);
210
+ free (selectionCPU);
209
211
}
0 commit comments