Skip to content

Commit 7435c0f

Browse files
author
Carsten Griwodz
committed
Merge branch 'develop' into dev/cmake-lang-cuda
2 parents fbc3080 + 8623b69 commit 7435c0f

File tree

3 files changed

+17
-49
lines changed

3 files changed

+17
-49
lines changed

CMakeLists.txt

-7
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ option(PopSift_BUILD_DOCS "Build PopSift documentation." OFF)
3131
option(PopSift_ERRCHK_AFTER_KERNEL "Synchronize and check CUDA error after every kernel." OFF)
3232
option(PopSift_USE_POSITION_INDEPENDENT_CODE "Generate position independent code." ON)
3333
option(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce compile time while debugging other things." ON)
34-
option(PopSift_USE_NORMF "The __normf function computes Euclidean distance on large arrays. Fast but stability is uncertain." OFF)
3534
option(PopSift_NVCC_WARNINGS "Switch on several additional warning for CUDA nvcc" OFF)
3635
option(PopSift_USE_TEST_CMD "Add testing step for functional verification" OFF)
3736
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
@@ -136,12 +135,6 @@ set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
136135
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
137136

138137

139-
if(PopSift_USE_NORMF AND CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
140-
set(PopSift_HAVE_NORMF 1)
141-
else()
142-
set(PopSift_HAVE_NORMF 0)
143-
endif()
144-
145138
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
146139
set(PopSift_HAVE_SHFL_DOWN_SYNC 1)
147140
else()

cmake/sift_config.h.in

-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,5 @@
1212
#define POPSIFT_IS_UNDEFINED(F) F() == 0
1313

1414
#define POPSIFT_HAVE_SHFL_DOWN_SYNC() @PopSift_HAVE_SHFL_DOWN_SYNC@
15-
#define POPSIFT_HAVE_NORMF() @PopSift_HAVE_NORMF@
1615
#define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@
1716

src/popsift/s_desc_norm_l2.h

+17-41
Original file line numberDiff line numberDiff line change
@@ -50,24 +50,10 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
5050
float4 descr;
5151
descr = ptr4[threadIdx.x];
5252

53-
#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_NORMF)
54-
// normf() is an elegant function: sqrt(sum_0^127{v^2})
55-
// It exists from CUDA 7.5 but the trouble with CUB on the GTX 980 Ti forces
56-
// us to with CUDA 7.0 right now
57-
5853
float norm;
5954

60-
if( threadIdx.x == 0 ) {
61-
norm = normf( 128, src_desc );
62-
}
63-
__syncthreads();
64-
norm = popsift::shuffle( norm, 0 );
65-
66-
descr.x = min( descr.x, 0.2f*norm );
67-
descr.y = min( descr.y, 0.2f*norm );
68-
descr.z = min( descr.z, 0.2f*norm );
69-
descr.w = min( descr.w, 0.2f*norm );
70-
55+
// 32 threads compute 4 squares each, then shuffle to performing a addition by
56+
// reduction for the sum of 128 squares, result in thread 0
7157
norm = descr.x * descr.x
7258
+ descr.y * descr.y
7359
+ descr.z * descr.z
@@ -77,34 +63,25 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
7763
norm += popsift::shuffle_down( norm, 4 );
7864
norm += popsift::shuffle_down( norm, 2 );
7965
norm += popsift::shuffle_down( norm, 1 );
80-
if( threadIdx.x == 0 ) {
81-
// norm = __fsqrt_rn( norm );
82-
// norm = __fdividef( 512.0f, norm );
83-
norm = __frsqrt_rn( norm ); // inverse square root
84-
norm = scalbnf( norm, d_consts.norm_multi );
85-
}
86-
#else // not HAVE_NORMF
87-
float norm;
8866

89-
norm = descr.x * descr.x
90-
+ descr.y * descr.y
91-
+ descr.z * descr.z
92-
+ descr.w * descr.w;
93-
norm += popsift::shuffle_down( norm, 16 );
94-
norm += popsift::shuffle_down( norm, 8 );
95-
norm += popsift::shuffle_down( norm, 4 );
96-
norm += popsift::shuffle_down( norm, 2 );
97-
norm += popsift::shuffle_down( norm, 1 );
9867
if( threadIdx.x == 0 ) {
99-
norm = __fsqrt_rn( norm );
68+
// compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0
69+
norm = __frsqrt_rn( norm );
10070
}
71+
72+
// spread the inverted norm from thread 0 to all threads in the warp
10173
norm = popsift::shuffle( norm, 0 );
10274

103-
descr.x = min( descr.x, 0.2f*norm );
104-
descr.y = min( descr.y, 0.2f*norm );
105-
descr.z = min( descr.z, 0.2f*norm );
106-
descr.w = min( descr.w, 0.2f*norm );
75+
// quasi-normalize all 128 floats
76+
descr.x = min( descr.x*norm, 0.2f );
77+
descr.y = min( descr.y*norm, 0.2f );
78+
descr.z = min( descr.z*norm, 0.2f );
79+
descr.w = min( descr.w*norm, 0.2f );
10780

81+
// Repeat the procedure, but also add a multiplier. E.g., if the user wants to
82+
// descriptors as bytes rather than floats, multiply by 256 - or even by 512
83+
// for better accuracy, which is OK because a point cannot be a keypoint if more
84+
// than half of its gradient is in a single direction.
10885
norm = descr.x * descr.x
10986
+ descr.y * descr.y
11087
+ descr.z * descr.z
@@ -114,13 +91,12 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
11491
norm += popsift::shuffle_down( norm, 4 );
11592
norm += popsift::shuffle_down( norm, 2 );
11693
norm += popsift::shuffle_down( norm, 1 );
94+
11795
if( threadIdx.x == 0 ) {
118-
// norm = __fsqrt_rn( norm );
119-
// norm = __fdividef( 512.0f, norm );
12096
norm = __frsqrt_rn( norm ); // inverse square root
12197
norm = scalbnf( norm, d_consts.norm_multi );
12298
}
123-
#endif // HAVE_NORMF
99+
124100
norm = popsift::shuffle( norm, 0 );
125101

126102
descr.x = descr.x * norm;

0 commit comments

Comments
 (0)