Skip to content

Commit 3d4a848

Browse files
committed
x86_64 SSE2 fast-path for str.contains(&str) and short needles
Based on Wojciech Muła's "SIMD-friendly algorithms for substring searching"[0]. The two-way algorithm is Big-O efficient, but it needs to preprocess the needle to find a "critical factorization" of it. This additional work is significant for short needles. Additionally, it mostly advances needle.len() bytes at a time. The SIMD-based approach used here, on the other hand, can advance based on its vector width, which can exceed the needle length, except in pathological cases; but because it is limited to small needles, the worst-case blowup is also small. Benchmarks taken on a Zen2: ``` 16CGU, OLD: test str::bench_contains_short_short ... bench: 27 ns/iter (+/- 1) test str::bench_contains_short_long ... bench: 667 ns/iter (+/- 29) test str::bench_contains_bad_naive ... bench: 131 ns/iter (+/- 2) test str::bench_contains_bad_simd ... bench: 130 ns/iter (+/- 2) test str::bench_contains_equal ... bench: 148 ns/iter (+/- 4) 16CGU, NEW: test str::bench_contains_short_short ... bench: 8 ns/iter (+/- 0) test str::bench_contains_short_long ... bench: 135 ns/iter (+/- 4) test str::bench_contains_bad_naive ... bench: 130 ns/iter (+/- 2) test str::bench_contains_bad_simd ... bench: 292 ns/iter (+/- 1) test str::bench_contains_equal ... bench: 3 ns/iter (+/- 0) 1CGU, OLD: test str::bench_contains_short_short ... bench: 30 ns/iter (+/- 0) test str::bench_contains_short_long ... bench: 713 ns/iter (+/- 17) test str::bench_contains_bad_naive ... bench: 131 ns/iter (+/- 3) test str::bench_contains_bad_simd ... bench: 130 ns/iter (+/- 3) test str::bench_contains_equal ... bench: 148 ns/iter (+/- 6) 1CGU, NEW: test str::bench_contains_short_short ... bench: 10 ns/iter (+/- 0) test str::bench_contains_short_long ... bench: 111 ns/iter (+/- 0) test str::bench_contains_bad_naive ... bench: 135 ns/iter (+/- 3) test str::bench_contains_bad_simd ... bench: 274 ns/iter (+/- 2) test str::bench_contains_equal ... bench: 4 ns/iter (+/- 0) ``` [0] http://0x80.pl/articles/simd-strfind.html#sse-avx2
1 parent 467b299 commit 3d4a848

File tree

1 file changed

+102
-0
lines changed

1 file changed

+102
-0
lines changed

library/core/src/str/pattern.rs

+102
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
)]
4040

4141
use crate::cmp;
42+
use crate::cmp::Ordering;
4243
use crate::fmt;
4344
use crate::slice::memchr;
4445

@@ -946,6 +947,27 @@ impl<'a, 'b> Pattern<'a> for &'b str {
946947
haystack.as_bytes().starts_with(self.as_bytes())
947948
}
948949

950+
/// Checks whether the pattern matches anywhere in the haystack
951+
#[inline]
952+
fn is_contained_in(self, haystack: &'a str) -> bool {
953+
if self.len() == 0 {
954+
return true;
955+
}
956+
957+
match self.len().cmp(&haystack.len()) {
958+
Ordering::Less => {
959+
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
960+
if self.as_bytes().len() <= 8 {
961+
return simd_contains(self, haystack);
962+
}
963+
964+
self.into_searcher(haystack).next_match().is_some()
965+
}
966+
Ordering::Equal => self == haystack,
967+
Ordering::Greater => false,
968+
}
969+
}
970+
949971
/// Removes the pattern from the front of haystack, if it matches.
950972
#[inline]
951973
fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
@@ -1684,3 +1706,83 @@ impl TwoWayStrategy for RejectAndMatch {
16841706
SearchStep::Match(a, b)
16851707
}
16861708
}
1709+
1710+
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
1711+
#[inline]
1712+
fn simd_contains(needle: &str, haystack: &str) -> bool {
1713+
let needle = needle.as_bytes();
1714+
let haystack = haystack.as_bytes();
1715+
1716+
if needle.len() == 1 {
1717+
return haystack.contains(&needle[0]);
1718+
}
1719+
1720+
const CHUNK: usize = 16;
1721+
1722+
// do a naive search if if the haystack is too small to fit
1723+
if haystack.len() < CHUNK + needle.len() - 1 {
1724+
return haystack.windows(needle.len()).any(|c| c == needle);
1725+
}
1726+
1727+
use crate::arch::x86_64::{
1728+
__m128i, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_set1_epi8,
1729+
};
1730+
1731+
// SAFETY: no preconditions other than sse2 being available
1732+
let first: __m128i = unsafe { _mm_set1_epi8(needle[0] as i8) };
1733+
// SAFETY: no preconditions other than sse2 being available
1734+
let last: __m128i = unsafe { _mm_set1_epi8(*needle.last().unwrap() as i8) };
1735+
1736+
let check_mask = #[cold]
1737+
|idx, mut mask: u32| -> bool {
1738+
while mask != 0 {
1739+
let trailing = mask.trailing_zeros();
1740+
let offset = idx + trailing as usize + 1;
1741+
let sub = &haystack[offset..][..needle.len() - 2];
1742+
let trimmed_needle = &needle[1..needle.len() - 1];
1743+
1744+
if sub == trimmed_needle {
1745+
return true;
1746+
}
1747+
mask &= !(1 << trailing);
1748+
}
1749+
return false;
1750+
};
1751+
1752+
let test_chunk = |i| -> bool {
1753+
// SAFETY: this requires at least CHUNK bytes being readable at offset i
1754+
// that is ensured by the loop ranges (see comments below)
1755+
let a: __m128i = unsafe { _mm_loadu_si128(haystack.as_ptr().add(i) as *const _) };
1756+
let b: __m128i =
1757+
// SAFETY: this requires CHUNK + needle.len() - 1 bytes being readable at offset i
1758+
unsafe { _mm_loadu_si128(haystack.as_ptr().add(i + needle.len() - 1) as *const _) };
1759+
1760+
// SAFETY: no preconditions other than sse2 being available
1761+
let eq_first: __m128i = unsafe { _mm_cmpeq_epi8(first, a) };
1762+
// SAFETY: no preconditions other than sse2 being available
1763+
let eq_last: __m128i = unsafe { _mm_cmpeq_epi8(last, b) };
1764+
1765+
// SAFETY: no preconditions other than sse2 being available
1766+
let mask: u32 = unsafe { _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last)) } as u32;
1767+
1768+
if mask != 0 {
1769+
return check_mask(i, mask);
1770+
}
1771+
return false;
1772+
};
1773+
1774+
let mut i = 0;
1775+
let mut result = false;
1776+
while !result && i + CHUNK + needle.len() <= haystack.len() {
1777+
result |= test_chunk(i);
1778+
i += CHUNK;
1779+
}
1780+
1781+
// process the tail that didn't fit into CHUNK-sized steps
1782+
// this simply repeats the same procedure but as right-aligned chunk instead
1783+
// of a left-aligned one. The last byte must be exactly flush with the string end so
1784+
// we don't miss a single byte or read out of bounds.
1785+
result |= test_chunk(haystack.len() + 1 - needle.len() - CHUNK);
1786+
1787+
return result;
1788+
}

0 commit comments

Comments
 (0)