@@ -50,24 +50,10 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
50
50
float4 descr;
51
51
descr = ptr4[threadIdx.x ];
52
52
53
- #if POPSIFT_IS_DEFINED(POPSIFT_HAVE_NORMF)
54
- // normf() is an elegant function: sqrt(sum_0^127{v^2})
55
- // It exists from CUDA 7.5 but the trouble with CUB on the GTX 980 Ti forces
56
- // us to stick with CUDA 7.0 right now
57
-
58
53
float norm;
59
54
60
- if ( threadIdx.x == 0 ) {
61
- norm = normf ( 128 , src_desc );
62
- }
63
- __syncthreads ();
64
- norm = popsift::shuffle ( norm, 0 );
65
-
66
- descr.x = min ( descr.x , 0.2f *norm );
67
- descr.y = min ( descr.y , 0.2f *norm );
68
- descr.z = min ( descr.z , 0.2f *norm );
69
- descr.w = min ( descr.w , 0.2f *norm );
70
-
55
+ // 32 threads compute 4 squares each, then use shuffles to perform an addition by
56
+ // reduction for the sum of 128 squares; the result ends up in thread 0
71
57
norm = descr.x * descr.x
72
58
+ descr.y * descr.y
73
59
+ descr.z * descr.z
@@ -77,34 +63,25 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
77
63
norm += popsift::shuffle_down ( norm, 4 );
78
64
norm += popsift::shuffle_down ( norm, 2 );
79
65
norm += popsift::shuffle_down ( norm, 1 );
80
- if ( threadIdx.x == 0 ) {
81
- // norm = __fsqrt_rn( norm );
82
- // norm = __fdividef( 512.0f, norm );
83
- norm = __frsqrt_rn ( norm ); // inverse square root
84
- norm = scalbnf ( norm, d_consts.norm_multi );
85
- }
86
- #else // not HAVE_NORMF
87
- float norm;
88
66
89
- norm = descr.x * descr.x
90
- + descr.y * descr.y
91
- + descr.z * descr.z
92
- + descr.w * descr.w ;
93
- norm += popsift::shuffle_down ( norm, 16 );
94
- norm += popsift::shuffle_down ( norm, 8 );
95
- norm += popsift::shuffle_down ( norm, 4 );
96
- norm += popsift::shuffle_down ( norm, 2 );
97
- norm += popsift::shuffle_down ( norm, 1 );
98
67
if ( threadIdx.x == 0 ) {
99
- norm = __fsqrt_rn ( norm );
68
+ // compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0
69
+ norm = __frsqrt_rn ( norm );
100
70
}
71
+
72
+ // spread the inverted norm from thread 0 to all threads in the warp
101
73
norm = popsift::shuffle ( norm, 0 );
102
74
103
- descr.x = min ( descr.x , 0.2f *norm );
104
- descr.y = min ( descr.y , 0.2f *norm );
105
- descr.z = min ( descr.z , 0.2f *norm );
106
- descr.w = min ( descr.w , 0.2f *norm );
75
+ // quasi-normalize all 128 floats
76
+ descr.x = min ( descr.x *norm, 0.2f );
77
+ descr.y = min ( descr.y *norm, 0.2f );
78
+ descr.z = min ( descr.z *norm, 0.2f );
79
+ descr.w = min ( descr.w *norm, 0.2f );
107
80
81
+ // Repeat the procedure, but also add a multiplier. E.g., if the user wants the
82
+ // descriptors as bytes rather than floats, multiply by 256 - or even by 512
83
+ // for better accuracy, which is OK because a point cannot be a keypoint if more
84
+ // than half of its gradient is in a single direction.
108
85
norm = descr.x * descr.x
109
86
+ descr.y * descr.y
110
87
+ descr.z * descr.z
@@ -114,13 +91,12 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
114
91
norm += popsift::shuffle_down ( norm, 4 );
115
92
norm += popsift::shuffle_down ( norm, 2 );
116
93
norm += popsift::shuffle_down ( norm, 1 );
94
+
117
95
if ( threadIdx.x == 0 ) {
118
- // norm = __fsqrt_rn( norm );
119
- // norm = __fdividef( 512.0f, norm );
120
96
norm = __frsqrt_rn ( norm ); // inverse square root
121
97
norm = scalbnf ( norm, d_consts.norm_multi );
122
98
}
123
- # endif // HAVE_NORMF
99
+
124
100
norm = popsift::shuffle ( norm, 0 );
125
101
126
102
descr.x = descr.x * norm;
0 commit comments