@@ -1196,52 +1196,37 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
     let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);

-    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
-    // strange decision, but consider the fact that merges more often go in the opposite direction
-    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
-    // backwards. To conclude, identifying runs by traversing backwards improves performance.
-    let mut end = len;
-    while end > 0 {
-        // Find the next natural run, and reverse it if it's strictly descending.
-        let mut start = end - 1;
-        if start > 0 {
-            start -= 1;
-
-            // SAFETY: The v.get_unchecked must be fed with correct inbound indicies.
-            unsafe {
-                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
-                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
-                        start -= 1;
-                    }
-                    v[start..end].reverse();
-                } else {
-                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
-                    {
-                        start -= 1;
-                    }
-                }
-            }
+    let mut end = 0;
+    let mut start = 0;
+
+    // Scan forward. Memory pre-fetching prefers forward scanning vs backwards scanning, and the
+    // code-gen is usually better. For the most sensitive types such as integers, these are merged
+    // bidirectionally at once. So there is no benefit in scanning backwards.
+    while end < len {
+        let (streak_end, was_reversed) = find_streak(&v[start..], is_less);
+        end += streak_end;
+        if was_reversed {
+            v[start..end].reverse();
        }

        // Insert some more elements into the run if it's too short. Insertion sort is faster than
        // merge sort on short sequences, so this significantly improves performance.
-        start = provide_sorted_batch(v, start, end, is_less);
+        end = provide_sorted_batch(v, start, end, is_less);

        // Push this run onto the stack.
        runs.push(TimSortRun { start, len: end - start });
-        end = start;
+        start = end;

        // Merge some pairs of adjacent runs to satisfy the invariants.
-        while let Some(r) = collapse(runs.as_slice()) {
-            let left = runs[r + 1];
-            let right = runs[r];
-            // SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and
-            // neither side may be on length 0.
+        while let Some(r) = collapse(runs.as_slice(), len) {
+            let left = runs[r];
+            let right = runs[r + 1];
+            let merge_slice = &mut v[left.start..right.start + right.len];
            unsafe {
-                merge(&mut v[left.start..right.start + right.len], left.len, buf_ptr, is_less);
+                merge(merge_slice, left.len, buf_ptr, is_less);
            }
-            runs[r] = TimSortRun { start: left.start, len: left.len + right.len };
-            runs.remove(r + 1);
+            runs[r + 1] = TimSortRun { start: left.start, len: left.len + right.len };
+            runs.remove(r);
        }
    }
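To make the new control flow easier to follow, here is a minimal safe-Rust sketch of the forward run-collection idea in isolation: it detects ascending and strictly descending streaks front to back and reverses the descending ones, which is what the `find_streak` + `reverse` pair above does before batching and merging. The names `Run` and `collect_runs` are illustrative only; the real code keeps the `is_less` closure, unchecked indexing, the insertion-sort batching, and the merge step.

```rust
// Illustrative sketch, not sort.rs internals.
#[derive(Debug)]
struct Run {
    start: usize,
    len: usize,
}

fn collect_runs<T: Ord>(v: &mut [T]) -> Vec<Run> {
    let len = v.len();
    let mut runs = Vec::new();
    let mut start = 0;

    while start < len {
        // Find how far the streak that begins at `start` extends, and whether it is
        // strictly descending.
        let mut end = start + 1;
        let descending = end < len && v[end] < v[end - 1];
        if descending {
            while end < len && v[end] < v[end - 1] {
                end += 1;
            }
            // Strictly descending streaks are reversed so that every run is ascending.
            v[start..end].reverse();
        } else {
            while end < len && v[end] >= v[end - 1] {
                end += 1;
            }
        }
        runs.push(Run { start, len: end - start });
        start = end;
    }
    runs
}

fn main() {
    let mut v = vec![1, 2, 3, 7, 5, 4, 9];
    let runs = collect_runs(&mut v);
    // [1, 2, 3, 7] is ascending, [5, 4] was descending and got reversed, [9] is a trailing run.
    assert_eq!(v, vec![1, 2, 3, 7, 4, 5, 9]);
    assert_eq!(runs.len(), 3);
    println!("{runs:?}");
}
```

Only strictly descending streaks are reversed, so equal elements never swap places and the sort stays stable.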
@@ -1263,10 +1248,10 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
     // run starts at index 0, it will always demand a merge operation until the stack is fully
     // collapsed, in order to complete the sort.
     #[inline]
-    fn collapse(runs: &[TimSortRun]) -> Option<usize> {
+    fn collapse(runs: &[TimSortRun], stop: usize) -> Option<usize> {
         let n = runs.len();
         if n >= 2
-            && (runs[n - 1].start == 0
+            && (runs[n - 1].start + runs[n - 1].len == stop
                 || runs[n - 2].len <= runs[n - 1].len
                 || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
                 || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
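The extra `stop` argument compensates for the change of scan direction: with the old backward scan, a newest run whose `start` was 0 meant the whole slice had been consumed, whereas with the forward scan the equivalent signal is that the newest run ends exactly at the slice length. Below is a standalone sketch of that check using `(start, len)` tuples in place of `TimSortRun`; the merge-index selection in the body is an assumption about the part of the function outside this hunk and only exists to make the example runnable.

```rust
// Illustrative sketch of the collapse condition; the index selection below the
// condition is assumed, not quoted from this hunk.
fn collapse(runs: &[(usize, usize)], stop: usize) -> Option<usize> {
    let n = runs.len();
    let len = |i: usize| runs[i].1;

    if n >= 2
        && (runs[n - 1].0 + runs[n - 1].1 == stop
            || len(n - 2) <= len(n - 1)
            || (n >= 3 && len(n - 3) <= len(n - 2) + len(n - 1))
            || (n >= 4 && len(n - 4) <= len(n - 3) + len(n - 2)))
    {
        // Assumed TimSort rule: merge further down the stack when the run below the
        // top pair is the smaller one, otherwise merge the top two runs.
        if n >= 3 && len(n - 3) < len(n - 1) { Some(n - 3) } else { Some(n - 2) }
    } else {
        None
    }
}

fn main() {
    // The invariants hold and the newest run (start 100, len 10) does not reach the
    // end of a 1000-element slice, so no merge is demanded yet.
    assert_eq!(collapse(&[(0, 100), (100, 10)], 1000), None);

    // Same stack, but the slice is only 110 elements long: the newest run ends at
    // `stop`, meaning the whole slice has been consumed, so collapse keeps demanding
    // merges until a single run remains.
    assert_eq!(collapse(&[(0, 100), (100, 10)], 110), Some(0));
}
```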
@@ -1454,33 +1439,70 @@ pub struct TimSortRun {
     start: usize,
 }

-/// Takes a range as denoted by start and end, that is already sorted and extends it to the left if
+/// Takes a range as denoted by start and end, that is already sorted and extends it to the right if
 /// necessary with sorts optimized for smaller ranges such as insertion sort.
 #[cfg(not(no_global_oom_handling))]
-fn provide_sorted_batch<T, F>(v: &mut [T], mut start: usize, end: usize, is_less: &mut F) -> usize
+fn provide_sorted_batch<T, F>(v: &mut [T], start: usize, mut end: usize, is_less: &mut F) -> usize
 where
     F: FnMut(&T, &T) -> bool,
 {
-    debug_assert!(end > start);
+    let len = v.len();
+    assert!(end >= start && end <= len);

    // This value is a balance between least comparisons and best performance, as
    // influenced by for example cache locality.
    const MIN_INSERTION_RUN: usize = 10;

    // Insert some more elements into the run if it's too short. Insertion sort is faster than
    // merge sort on short sequences, so this significantly improves performance.
-    let start_found = start;
    let start_end_diff = end - start;

-    if start_end_diff < MIN_INSERTION_RUN && start != 0 {
+    if start_end_diff < MIN_INSERTION_RUN && end < len {
        // v[start_found..end] are elements that are already sorted in the input. We want to extend
        // the sorted region to the left, so we push up MIN_INSERTION_RUN - 1 to the right. Which is
        // more efficient that trying to push those already sorted elements to the left.
+        end = cmp::min(start + MIN_INSERTION_RUN, len);
+        let presorted_start = cmp::max(start_end_diff, 1);

-        start = if end >= MIN_INSERTION_RUN { end - MIN_INSERTION_RUN } else { 0 };
+        insertion_sort_shift_left(&mut v[start..end], presorted_start, is_less);
+    }

-        insertion_sort_shift_right(&mut v[start..end], start_found - start, is_less);
+    end
+}
+
+/// Finds a streak of presorted elements starting at the beginning of the slice. Returns the first
+/// value that is not part of said streak, and a bool denoting wether the streak was reversed.
+/// Streaks can be increasing or decreasing.
+fn find_streak<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+
+    if len < 2 {
+        return (len, false);
    }

-    start
+    let mut end = 2;
+
+    // SAFETY: See below specific.
+    unsafe {
+        // SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
+        let assume_reverse = is_less(v.get_unchecked(1), v.get_unchecked(0));
+
+        // SAFETY: We know end >= 2 and check end < len.
+        // From that follows that accessing v at end and end - 1 is safe.
+        if assume_reverse {
+            while end < len && is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
+                end += 1;
+            }
+
+            (end, true)
+        } else {
+            while end < len && !is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
+                end += 1;
+            }
+            (end, false)
+        }
+    }
 }
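For reference, this is what the `find_streak` contract looks like when written with ordinary bounds-checked indexing; the helper name `find_streak_safe` and the sample inputs are illustrative, not part of the library.

```rust
// Safe-indexing sketch of the streak detection shown above.
fn find_streak_safe<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();
    if len < 2 {
        return (len, false);
    }

    let mut end = 2;
    if is_less(&v[1], &v[0]) {
        // Strictly descending streak.
        while end < len && is_less(&v[end], &v[end - 1]) {
            end += 1;
        }
        (end, true)
    } else {
        // Non-descending (ascending or equal) streak.
        while end < len && !is_less(&v[end], &v[end - 1]) {
            end += 1;
        }
        (end, false)
    }
}

fn main() {
    let mut lt = |a: &i32, b: &i32| a < b;

    // Ascending prefix [1, 2, 2, 5]; the 3 breaks the streak.
    assert_eq!(find_streak_safe(&[1, 2, 2, 5, 3], &mut lt), (4, false));

    // Strictly descending prefix [9, 7, 4]; reversing it yields an ascending run.
    assert_eq!(find_streak_safe(&[9, 7, 4, 8], &mut lt), (3, true));

    // Slices shorter than two elements are returned as-is.
    assert_eq!(find_streak_safe(&[42], &mut lt), (1, false));
}
```

For slices with at least two elements the returned `end` is always at least 2, so after an optional `reverse` the prefix `v[..end]` forms a sorted run of length two or more.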