
Commit 54aa8e7

Merge pull request #627 from xtensor-stack/feature/float-to-uint-conversion
Provide some conversion operators for float -> uint32
2 parents (26063cb + be1eb18), commit 54aa8e7

4 files changed: +83 -15 lines

include/xsimd/arch/xsimd_avx.hpp (+18)
@@ -491,6 +491,7 @@ namespace xsimd
                return get_half_complex_d<1>(self.real(), self.imag());
            }
        }
+
        // convert
        namespace detail
        {
@@ -499,11 +500,28 @@ namespace xsimd
            {
                return _mm256_cvtepi32_ps(self);
            }
+
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx>)
+            {
+                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+                __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
+                __m256 cnst65536f = _mm256_set1_ps(65536.0f);
+
+                __m256i v_lo = bitwise_and(batch<uint32_t, A>(v), batch<uint32_t, A>(msk_lo)); /* extract the 16 least significant bits of v */
+                __m256i v_hi = bitwise_rshift(batch<uint32_t, A>(v), 16, avx {}); /* 16 most significant bits of v */
+                __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
+                __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
+                v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
+                return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here; mul and add may fuse to FMA on Haswell and newer */
+            }
+
            template <class A>
            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>)
            {
                return _mm256_cvttps_epi32(self);
            }
+
        }

        // div
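
The new AVX overload works around the fact that _mm256_cvtepi32_ps converts signed int32 only, so any input at or above 2^31 would come out negative. Splitting the value into 16-bit halves keeps every intermediate exactly representable in a float and defers the single rounding step to the final add. A minimal scalar sketch of the same arithmetic, not part of the commit and using an illustrative function name:

#include <cassert>
#include <cstdint>

// Scalar model of the vectorized trick: convert the two 16-bit halves
// separately (each is exact in a float) and recombine with one multiply-add.
inline float uint32_to_float_reference(uint32_t v)
{
    float lo = static_cast<float>(v & 0xFFFFu); // 16 least significant bits, exact
    float hi = static_cast<float>(v >> 16);     // 16 most significant bits, exact
    return 65536.0f * hi + lo;                  // the only step that may round
}

int main()
{
    assert(uint32_to_float_reference(0u) == 0.0f);
    assert(uint32_to_float_reference(65536u) == 65536.0f);
    assert(uint32_to_float_reference(0x80000000u) == 2147483648.0f); // would be wrong with a signed convert
    return 0;
}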

include/xsimd/arch/xsimd_avx2.hpp (+20)
@@ -242,6 +242,26 @@ namespace xsimd
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }
+        // convert
+        namespace detail
+        {
+
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>)
+            {
+                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+                __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
+                __m256 cnst65536f = _mm256_set1_ps(65536.0f);
+
+                __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 least significant bits of v */
+                __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
+                __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
+                __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
+                v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
+                return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here; mul and add may fuse to FMA on Haswell and newer */
+            }
+
+        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
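
The AVX2 variant is the same algorithm, but it can use the native 256-bit integer instructions (_mm256_and_si256, _mm256_srli_epi32) directly, whereas the plain AVX overload above has to route the integer steps through the generic bitwise_and and bitwise_rshift kernels, since AVX by itself lacks 256-bit integer and/shift instructions. These detail::fast_cast overloads are not called directly from user code; a hedged usage sketch, assuming xsimd::batch_cast is the public entry point that dispatches to them when a fast conversion exists for the selected architecture:

#include <cstdint>
#include <iostream>
#include "xsimd/xsimd.hpp"

int main()
{
    // Broadcast a value above INT32_MAX, the case a plain signed convert gets wrong.
    xsimd::batch<uint32_t> u(3000000000u);
    // Assumption: batch_cast selects the detail::fast_cast overload for the current arch.
    xsimd::batch<float> f = xsimd::batch_cast<float>(u);

    float out[xsimd::batch<float>::size];
    f.store_unaligned(out);
    std::cout << out[0] << '\n'; // expected: 3e+09
    return 0;
}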

include/xsimd/arch/xsimd_avx512f.hpp (+28 -15)
@@ -625,6 +625,34 @@ namespace xsimd
            return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF);
        }

+        // convert
+        namespace detail
+        {
+            template <class A>
+            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>)
+            {
+                return _mm512_cvtepi32_ps(self);
+            }
+
+            template <class A>
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>)
+            {
+                return _mm512_cvttps_epi32(self);
+            }
+
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>)
+            {
+                return _mm512_cvtepu32_ps(self);
+            }
+
+            template <class A>
+            batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx512f>)
+            {
+                return _mm512_cvttps_epu32(self);
+            }
+        }
+
        namespace detail
        {
            // complex_low
@@ -656,21 +684,6 @@ namespace xsimd
            }
        }

-        // convert
-        namespace detail
-        {
-            template <class A>
-            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>)
-            {
-                return _mm512_cvtepi32_ps(self);
-            }
-            template <class A>
-            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>)
-            {
-                return _mm512_cvttps_epi32(self);
-            }
-        }
-
        // div
        template <class A>
        inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>)
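
On AVX512F no bit-splitting is needed: the ISA provides dedicated unsigned conversions (_mm512_cvtepu32_ps and _mm512_cvttps_epu32), which the new overloads wrap directly. The second hunk only removes the old int32-only convert block, which this commit relocates above the complex helpers. A standalone sketch using the same intrinsics outside of xsimd, assuming an AVX512F-capable machine and a build with -mavx512f:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    // INT32_MIN has the same bit pattern as the unsigned value 2^31 = 2147483648.
    __m512i u = _mm512_set1_epi32(INT32_MIN);
    __m512 f = _mm512_cvtepu32_ps(u);   // unsigned 32-bit -> float, one instruction
    __m512i r = _mm512_cvttps_epu32(f); // float -> unsigned 32-bit, truncating

    float fo[16];
    uint32_t io[16];
    _mm512_storeu_ps(fo, f);
    _mm512_storeu_si512(io, r);
    std::printf("%.1f %u\n", fo[0], io[0]); // expected: 2147483648.0 2147483648
    return 0;
}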

include/xsimd/arch/xsimd_sse2.hpp (+17)
@@ -483,11 +483,28 @@ namespace xsimd
            {
                return _mm_cvtepi32_ps(self);
            }
+
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<sse2>)
+            {
+                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+                __m128i msk_lo = _mm_set1_epi32(0xFFFF);
+                __m128 cnst65536f = _mm_set1_ps(65536.0f);
+
+                __m128i v_lo = _mm_and_si128(v, msk_lo); /* extract the 16 least significant bits of v */
+                __m128i v_hi = _mm_srli_epi32(v, 16); /* 16 most significant bits of v */
+                __m128 v_lo_flt = _mm_cvtepi32_ps(v_lo); /* No rounding */
+                __m128 v_hi_flt = _mm_cvtepi32_ps(v_hi); /* No rounding */
+                v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
+                return _mm_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here; mul and add may fuse to FMA on Haswell and newer */
+            }
+
            template <class A>
            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>)
            {
                return _mm_cvttps_epi32(self);
            }
+
        }

        // eq
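
The SSE2 overload is the case the referenced Stack Overflow answer targets most directly. Because the final add is the only step that can round, the vector result should match the compiler's own scalar uint32 -> float conversion for every input under the default round-to-nearest mode, including UINT32_MAX, which rounds up to 4294967296.0f. A standalone sketch, not part of the commit, that spot-checks this with raw SSE2 intrinsics:

#include <emmintrin.h> // SSE2
#include <cstdint>
#include <cstdio>

// Same arithmetic as the new sse2 fast_cast overload, written as a free function.
static __m128 uint32_to_float_sse2(__m128i v)
{
    __m128i v_lo = _mm_and_si128(v, _mm_set1_epi32(0xFFFF)); // low 16 bits
    __m128i v_hi = _mm_srli_epi32(v, 16);                    // high 16 bits
    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(65536.0f), _mm_cvtepi32_ps(v_hi)),
                      _mm_cvtepi32_ps(v_lo));
}

int main()
{
    uint32_t in[4] = { 0u, 1u, 2147483648u, 4294967295u };
    __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
    float out[4];
    _mm_storeu_ps(out, uint32_to_float_sse2(v));
    for (int i = 0; i < 4; ++i)
        std::printf("%u -> %.1f (scalar: %.1f)\n",
                    static_cast<unsigned>(in[i]), out[i], static_cast<float>(in[i]));
    return 0;
}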
