handcode the loops so LLVM has to chew less IR

the8472 · the8472 · commit 5850f36dfed1 · 2023-10-07T21:19:25.000+02:00
diff --git a/library/core/src/slice/cmp.rs b/library/core/src/slice/cmp.rs
@@ -55,54 +55,57 @@ impl<A, B> SlicePartialEq<B> for [A]
 where
     A: PartialEq<B>,
 {
-    #[inline]
     default fn equal(&self, other: &[B]) -> bool {
         if self.len() != other.len() {
             return false;
         }
 
-        // at least 8 items for unrolling to make sense (4 peeled + 4+ unrolled)
-        if self.len() < 8 {
-            return eq_small(self, other);
+        if self.len() == 0 {
+            return true;
         }
 
-        eq_unroll(self, other)
-    }
-}
+        // ZSTs have no identity and slices don't guarantee which addresses-to-ZSTs they produce
+        // so we only need to compare them once to determine the behavior of the PartialEq impl
+        if const { mem::size_of::<A>() == 0 && mem::size_of::<B>() == 0 } {
+            // empty slices already returned above, so there is at least one pair to compare
+            // SAFETY: A and B are ZSTs so it's ok to conjure them out of thin air
+            return unsafe { mem::zeroed::<A>() == mem::zeroed::<B>() };
+        }
 
-#[inline]
-fn eq_small<A, B>(a: &[A], b: &[B]) -> bool
-where
-    A: PartialEq<B>,
-{
-    a.iter().zip(b).all(|(a, b)| a == b)
-}
+        const UNROLL: usize = 4;
+        let mut i = 0;
+        let mut is_eq = true;
+
+        let a = self.as_ptr();
+        let b = other.as_ptr();
+        let len = self.len();
+
+        // compare items one by one first so mismatches bail out early in case comparisons are
+        // expensive: at least one item, then until the remainder is a multiple of UNROLL
+        loop {
+            // SAFETY: slices are of the same length and loop conditions ensure indexes are in bounds
+            unsafe {
+                is_eq = is_eq & PartialEq::eq(&*a.add(i), &*b.add(i));
+                i = i.unchecked_add(1);
+            }
 
-fn eq_unroll<A, B>(a: &[A], b: &[B]) -> bool
-where
-    A: PartialEq<B>,
-{
-    let (mut chunks_a, residual_a) = a.as_chunks::<4>();
-    let (mut chunks_b, residual_b) = b.as_chunks::<4>();
-    let peeled_a = chunks_a.take_first().unwrap();
-    let peeled_b = chunks_b.take_first().unwrap();
-
-    // peel the first chunk and do a short-circuiting comparison to bail early on mismatches
-    // in case comparisons are expensive
-    let mut result = eq_small(peeled_a, peeled_b);
-
-    // then check the residual, another chance to bail early
-    result = result && eq_small(residual_a, residual_b);
-
-    // iter.all short-circuits which means the backend can't unroll the loop due to early exits.
-    // So we unroll it manually.
-    result = result
-        && chunks_a
-            .iter()
-            .zip(chunks_b)
-            .all(|(a, b)| (a[0] == b[0]) & (a[1] == b[1]) & (a[2] == b[2]) & (a[3] == b[3]));
-
-    result
+            if !is_eq || i == len || (len - i) % UNROLL == 0 {
+                break;
+            }
+        }
+        while is_eq && i + UNROLL <= len {
+            // SAFETY: slices are of the same length and loop conditions ensure indexes are in bounds
+            unsafe {
+                is_eq = is_eq & PartialEq::eq(&*a.add(i), &*b.add(i));
+                is_eq = is_eq & PartialEq::eq(&*a.add(i + 1), &*b.add(i + 1));
+                is_eq = is_eq & PartialEq::eq(&*a.add(i + 2), &*b.add(i + 2));
+                is_eq = is_eq & PartialEq::eq(&*a.add(i + 3), &*b.add(i + 3));
+                i = i.unchecked_add(UNROLL);
+            }
+        }
+
+        is_eq
+    }
 }
 
 // When each element can be compared byte-wise, we can compare all the bytes