improve code generated for AVX2 signed integer comparisons

tomjnixon · tomjnixon · commit 7a7a51fa0655 · 2021-11-16T00:34:44.000Z
previously AVX2 gt was implemented, but lt fell back to the AVX lt,
which is implemented with SSE2 instructions

generic_logical has the following mappings:

 - gt -> lt
 - ge -> le
 - le -> lt || eq

so it's best to just implement eq, lt, and le when they are available
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
@@ -569,44 +569,6 @@ namespace xsimd
             return _mm256_floor_pd(self);
         }
 
-        // ge
-        template <class A>
-        inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>)
-        {
-            return _mm256_cmp_ps(self, other, _CMP_GE_OQ);
-        }
-        template <class A>
-        inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>)
-        {
-            return _mm256_cmp_pd(self, other, _CMP_GE_OQ);
-        }
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>)
-        {
-            return detail::fwd_to_sse([](__m128i s, __m128i o)
-                                      { return ge(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
-                                      self, other);
-        }
-
-        // gt
-        template <class A>
-        inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>)
-        {
-            return _mm256_cmp_ps(self, other, _CMP_GT_OQ);
-        }
-        template <class A>
-        inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>)
-        {
-            return _mm256_cmp_pd(self, other, _CMP_GT_OQ);
-        }
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>)
-        {
-            return detail::fwd_to_sse([](__m128i s, __m128i o)
-                                      { return gt(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
-                                      self, other);
-        }
-
         // hadd
         template <class A>
         inline float hadd(batch<float, A> const& rhs, requires_arch<avx>)
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
@@ -262,29 +262,29 @@ namespace xsimd
             }
         }
 
-        // gt
+        // lt
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>)
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>)
         {
             if (std::is_signed<T>::value)
             {
                 switch (sizeof(T))
                 {
                 case 1:
-                    return _mm256_cmpgt_epi8(self, other);
+                    return _mm256_cmpgt_epi8(other, self);
                 case 2:
-                    return _mm256_cmpgt_epi16(self, other);
+                    return _mm256_cmpgt_epi16(other, self);
                 case 4:
-                    return _mm256_cmpgt_epi32(self, other);
+                    return _mm256_cmpgt_epi32(other, self);
                 case 8:
-                    return _mm256_cmpgt_epi64(self, other);
+                    return _mm256_cmpgt_epi64(other, self);
                 default:
-                    return gt(self, other, avx {});
+                    return lt(self, other, avx {});
                 }
             }
             else
             {
-                return gt(self, other, avx {});
+                return lt(self, other, avx {});
             }
         }
 

Original file line number	Diff line number	Diff line change
`@@ -262,29 +262,29 @@ namespace xsimd`
`262`	`262`	`}`
`263`	`263`	`}`
`264`	`264`
`265`		`- // gt`
	`265`	`+ // lt`
`266`	`266`	`template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>`
`267`		`- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>)`
	`267`	`+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>)`
`268`	`268`	`{`
`269`	`269`	`if (std::is_signed<T>::value)`
`270`	`270`	`{`
`271`	`271`	`switch (sizeof(T))`
`272`	`272`	`{`
`273`	`273`	`case 1:`
`274`		`- return _mm256_cmpgt_epi8(self, other);`
	`274`	`+ return _mm256_cmpgt_epi8(other, self);`
`275`	`275`	`case 2:`
`276`		`- return _mm256_cmpgt_epi16(self, other);`
	`276`	`+ return _mm256_cmpgt_epi16(other, self);`
`277`	`277`	`case 4:`
`278`		`- return _mm256_cmpgt_epi32(self, other);`
	`278`	`+ return _mm256_cmpgt_epi32(other, self);`
`279`	`279`	`case 8:`
`280`		`- return _mm256_cmpgt_epi64(self, other);`
	`280`	`+ return _mm256_cmpgt_epi64(other, self);`
`281`	`281`	`default:`
`282`		`- return gt(self, other, avx {});`
	`282`	`+ return lt(self, other, avx {});`
`283`	`283`	`}`
`284`	`284`	`}`
`285`	`285`	`else`
`286`	`286`	`{`
`287`		`- return gt(self, other, avx {});`
	`287`	`+ return lt(self, other, avx {});`
`288`	`288`	`}`
`289`	`289`	`}`
`290`	`290`