
Commit f297afa

Flip scanning direction of stable sort
Memory pre-fetching prefers forward scanning over backward scanning, and the code-gen is usually better. For the most sensitive types, such as integers, runs are planned to be merged bidirectionally at once, so there is no benefit in scanning backwards. The largest perf gains are seen for fully ascending and descending inputs, which see 1.5x speedups. Random inputs benefit too; some patterns can lose out, but these losses are minimal.
1 parent a3065a1 commit f297afa
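
To sanity-check perf claims like these locally, a rough micro-benchmark along the following lines can be used. This is only an illustrative sketch, not the harness behind the quoted numbers; the 1_000_000-element input size and the LCG constants are arbitrary choices.

use std::time::Instant;

fn time_sort(label: &str, mut v: Vec<u64>) {
    let start = Instant::now();
    v.sort(); // stable sort, backed by the merge sort driver in slice/sort.rs
    println!("{label:>12}: {:?}", start.elapsed());
}

fn main() {
    let n: u64 = 1_000_000;
    let ascending: Vec<u64> = (0..n).collect();
    let descending: Vec<u64> = (0..n).rev().collect();
    // Simple LCG so the sketch needs no external crates for pseudo-random data.
    let mut x: u64 = 0x2545F4914F6CDD1D;
    let random: Vec<u64> = (0..n)
        .map(|_| {
            x = x.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
            x
        })
        .collect();

    time_sort("ascending", ascending);
    time_sort("descending", descending);
    time_sort("random", random);
}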

1 file changed (+67 −45 lines changed)

library/core/src/slice/sort.rs
@@ -1196,52 +1196,37 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
 
     let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);
 
-    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
-    // strange decision, but consider the fact that merges more often go in the opposite direction
-    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
-    // backwards. To conclude, identifying runs by traversing backwards improves performance.
-    let mut end = len;
-    while end > 0 {
-        // Find the next natural run, and reverse it if it's strictly descending.
-        let mut start = end - 1;
-        if start > 0 {
-            start -= 1;
-
-            // SAFETY: The v.get_unchecked must be fed with correct inbound indices.
-            unsafe {
-                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
-                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
-                        start -= 1;
-                    }
-                    v[start..end].reverse();
-                } else {
-                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
-                    {
-                        start -= 1;
-                    }
-                }
-            }
+    let mut end = 0;
+    let mut start = 0;
+
+    // Scan forward. Memory pre-fetching prefers forward scanning vs backwards scanning, and the
+    // code-gen is usually better. For the most sensitive types such as integers, these are merged
+    // bidirectionally at once. So there is no benefit in scanning backwards.
+    while end < len {
+        let (streak_end, was_reversed) = find_streak(&v[start..], is_less);
+        end += streak_end;
+        if was_reversed {
+            v[start..end].reverse();
         }
 
         // Insert some more elements into the run if it's too short. Insertion sort is faster than
         // merge sort on short sequences, so this significantly improves performance.
-        start = provide_sorted_batch(v, start, end, is_less);
+        end = provide_sorted_batch(v, start, end, is_less);
 
         // Push this run onto the stack.
         runs.push(TimSortRun { start, len: end - start });
-        end = start;
+        start = end;
 
         // Merge some pairs of adjacent runs to satisfy the invariants.
-        while let Some(r) = collapse(runs.as_slice()) {
-            let left = runs[r + 1];
-            let right = runs[r];
-            // SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and
-            // neither side may be on length 0.
+        while let Some(r) = collapse(runs.as_slice(), len) {
+            let left = runs[r];
+            let right = runs[r + 1];
+            let merge_slice = &mut v[left.start..right.start + right.len];
             unsafe {
-                merge(&mut v[left.start..right.start + right.len], left.len, buf_ptr, is_less);
+                merge(merge_slice, left.len, buf_ptr, is_less);
             }
-            runs[r] = TimSortRun { start: left.start, len: left.len + right.len };
-            runs.remove(r + 1);
+            runs[r + 1] = TimSortRun { start: left.start, len: left.len + right.len };
+            runs.remove(r);
         }
     }
 
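A note on the merge bookkeeping above: runs are now discovered left to right, so `runs[r]` is the left run and `runs[r + 1]` the right one, and the merged result is stored in `runs[r + 1]` while `runs[r]` is removed; with the old backwards scan the roles were swapped. The real `merge` works in place through `buf_ptr`; purely as an illustration, a safe, allocation-based merge of two adjacent sorted runs could look like this (`merge_adjacent_sketch` is a hypothetical helper, not part of sort.rs):

// Illustration only: stably merge the sorted halves v[..mid] and v[mid..] using a temporary Vec.
fn merge_adjacent_sketch<T: Ord + Clone>(v: &mut [T], mid: usize) {
    let mut merged: Vec<T> = Vec::with_capacity(v.len());
    let (left, right) = v.split_at(mid);
    let (mut i, mut j) = (0, 0);
    while i < left.len() && j < right.len() {
        // Take from the left on ties so that equal elements keep their order (stability).
        if right[j] < left[i] {
            merged.push(right[j].clone());
            j += 1;
        } else {
            merged.push(left[i].clone());
            i += 1;
        }
    }
    merged.extend_from_slice(&left[i..]);
    merged.extend_from_slice(&right[j..]);
    v.clone_from_slice(&merged);
}

For instance, calling it on `[1, 4, 7, 2, 3, 9]` with `mid = 3` yields `[1, 2, 3, 4, 7, 9]`.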
@@ -1263,10 +1248,10 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
     // run starts at index 0, it will always demand a merge operation until the stack is fully
     // collapsed, in order to complete the sort.
     #[inline]
-    fn collapse(runs: &[TimSortRun]) -> Option<usize> {
+    fn collapse(runs: &[TimSortRun], stop: usize) -> Option<usize> {
         let n = runs.len();
         if n >= 2
-            && (runs[n - 1].start == 0
+            && (runs[n - 1].start + runs[n - 1].len == stop
                 || runs[n - 2].len <= runs[n - 1].len
                 || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
                 || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
@@ -1454,33 +1439,70 @@ pub struct TimSortRun {
     start: usize,
 }
 
-/// Takes a range as denoted by start and end, that is already sorted and extends it to the left if
+/// Takes a range as denoted by start and end, that is already sorted and extends it to the right if
 /// necessary with sorts optimized for smaller ranges such as insertion sort.
 #[cfg(not(no_global_oom_handling))]
-fn provide_sorted_batch<T, F>(v: &mut [T], mut start: usize, end: usize, is_less: &mut F) -> usize
+fn provide_sorted_batch<T, F>(v: &mut [T], start: usize, mut end: usize, is_less: &mut F) -> usize
 where
     F: FnMut(&T, &T) -> bool,
 {
-    debug_assert!(end > start);
+    let len = v.len();
+    assert!(end >= start && end <= len);
 
     // This value is a balance between least comparisons and best performance, as
     // influenced by for example cache locality.
     const MIN_INSERTION_RUN: usize = 10;
 
     // Insert some more elements into the run if it's too short. Insertion sort is faster than
     // merge sort on short sequences, so this significantly improves performance.
-    let start_found = start;
     let start_end_diff = end - start;
 
-    if start_end_diff < MIN_INSERTION_RUN && start != 0 {
+    if start_end_diff < MIN_INSERTION_RUN && end < len {
         // v[start_found..end] are elements that are already sorted in the input. We want to extend
         // the sorted region to the left, so we push up MIN_INSERTION_RUN - 1 to the right. Which is
         // more efficient than trying to push those already sorted elements to the left.
+        end = cmp::min(start + MIN_INSERTION_RUN, len);
+        let presorted_start = cmp::max(start_end_diff, 1);
 
-        start = if end >= MIN_INSERTION_RUN { end - MIN_INSERTION_RUN } else { 0 };
+        insertion_sort_shift_left(&mut v[start..end], presorted_start, is_less);
+    }
 
-        insertion_sort_shift_right(&mut v[start..end], start_found - start, is_less);
+    end
+}
+
+/// Finds a streak of presorted elements starting at the beginning of the slice. Returns the first
+/// value that is not part of said streak, and a bool denoting whether the streak was reversed.
+/// Streaks can be increasing or decreasing.
+fn find_streak<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+
+    if len < 2 {
+        return (len, false);
     }
 
-    start
+    let mut end = 2;
+
+    // SAFETY: See below specific.
+    unsafe {
+        // SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
+        let assume_reverse = is_less(v.get_unchecked(1), v.get_unchecked(0));
+
+        // SAFETY: We know end >= 2 and check end < len.
+        // From that follows that accessing v at end and end - 1 is safe.
+        if assume_reverse {
+            while end < len && is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
+                end += 1;
+            }
+
+            (end, true)
+        } else {
+            while end < len && !is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
+                end += 1;
+            }
+            (end, false)
+        }
+    }
 }
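
Putting the new pieces together, the following is a hypothetical safe re-implementation of the forward run collection for `Ord` types; the names and the example input are illustrative, and the real loop additionally pads short runs via `provide_sorted_batch` and merges runs eagerly as it goes.

fn collect_runs_sketch<T: Ord>(v: &mut [T]) -> Vec<(usize, usize)> {
    let len = v.len();
    let mut runs = Vec::new();
    let mut start = 0;
    while start < len {
        // Find the streak beginning at `start` (same idea as `find_streak` above, in safe code).
        let mut end = start + 1;
        if end < len && v[end] < v[end - 1] {
            // Strictly descending streak: extend it, then reverse it into ascending order.
            while end < len && v[end] < v[end - 1] {
                end += 1;
            }
            v[start..end].reverse();
        } else {
            // Non-descending streak: extend while elements do not decrease.
            while end < len && v[end] >= v[end - 1] {
                end += 1;
            }
        }
        runs.push((start, end - start)); // (start, len), like `TimSortRun`
        start = end;
    }
    runs
}

fn main() {
    let mut v = vec![1, 2, 3, 9, 8, 7, 4, 5];
    let runs = collect_runs_sketch(&mut v);
    assert_eq!(runs, vec![(0, 4), (4, 3), (7, 1)]);
    assert_eq!(v, vec![1, 2, 3, 9, 4, 7, 8, 5]); // the descending [8, 7, 4] was reversed
}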
