42
42
} ,
43
43
solana_streamer:: sendmmsg:: { batch_send, SendPktsError } ,
44
44
std:: {
45
- collections:: { HashMap , HashSet } ,
45
+ collections:: { hash_map :: Entry , HashMap , HashSet } ,
46
46
iter:: Iterator ,
47
47
net:: { SocketAddr , UdpSocket } ,
48
48
sync:: {
59
59
const DEFER_REPAIR_THRESHOLD : Duration = Duration :: from_millis ( 200 ) ;
60
60
const DEFER_REPAIR_THRESHOLD_TICKS : u64 = DEFER_REPAIR_THRESHOLD . as_millis ( ) as u64 / MS_PER_TICK ;
61
61
62
+ // This is the amount of time we will wait for a repair request to be fulfilled
63
+ // before making another request. The value is based on a reasonable upper bound of
64
+ // expected network delays in requesting repairs and receiving shreds.
65
+ const REPAIR_REQUEST_TIMEOUT_MS : u64 = 100 ;
66
+
62
67
// When requesting repair for a specific shred through the admin RPC, we will
63
68
// request up to NUM_PEERS_TO_SAMPLE_FOR_REPAIRS in the event a specific, valid
64
69
// target node is not provided. This number was chosen to provide reasonable
@@ -208,7 +213,7 @@ impl BestRepairsStats {
208
213
pub const MAX_REPAIR_LENGTH : usize = 512 ;
209
214
pub const MAX_REPAIR_PER_DUPLICATE : usize = 20 ;
210
215
pub const MAX_DUPLICATE_WAIT_MS : usize = 10_000 ;
211
- pub const REPAIR_MS : u64 = 100 ;
216
+ pub const REPAIR_MS : u64 = 1 ;
212
217
pub const MAX_ORPHANS : usize = 5 ;
213
218
pub const MAX_UNKNOWN_LAST_INDEX_REPAIRS : usize = 10 ;
214
219
pub const MAX_CLOSEST_COMPLETION_REPAIRS : usize = 100 ;
@@ -328,6 +333,8 @@ impl RepairService {
328
333
let mut last_stats = Instant :: now ( ) ;
329
334
let mut peers_cache = LruCache :: new ( REPAIR_PEERS_CACHE_CAPACITY ) ;
330
335
let mut popular_pruned_forks_requests = HashSet :: new ( ) ;
336
+ // Maps each repair that may still be outstanding to the timestamp (in ms) at which it was requested.
337
+ let mut outstanding_repairs = HashMap :: new ( ) ;
331
338
332
339
while !exit. load ( Ordering :: Relaxed ) {
333
340
let mut set_root_elapsed;
@@ -399,11 +406,17 @@ impl RepairService {
399
406
) ;
400
407
add_votes_elapsed. stop ( ) ;
401
408
409
+ // Purge old entries. They've either completed or need to be retried.
410
+ outstanding_repairs. retain ( |_repair_request, time| {
411
+ timestamp ( ) . saturating_sub ( * time) < REPAIR_REQUEST_TIMEOUT_MS
412
+ } ) ;
413
+
402
414
let repairs = match repair_info. wen_restart_repair_slots . clone ( ) {
403
415
Some ( slots_to_repair) => Self :: generate_repairs_for_wen_restart (
404
416
blockstore,
405
417
MAX_REPAIR_LENGTH ,
406
418
& slots_to_repair. read ( ) . unwrap ( ) ,
419
+ & mut outstanding_repairs,
407
420
) ,
408
421
None => repair_weight. get_best_weighted_repairs (
409
422
blockstore,
@@ -415,6 +428,7 @@ impl RepairService {
415
428
MAX_CLOSEST_COMPLETION_REPAIRS ,
416
429
& mut repair_timing,
417
430
& mut best_repairs_stats,
431
+ & mut outstanding_repairs,
418
432
) ,
419
433
} ;
420
434
@@ -631,17 +645,33 @@ impl RepairService {
631
645
slot : Slot ,
632
646
slot_meta : & SlotMeta ,
633
647
max_repairs : usize ,
648
+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
634
649
) -> Vec < ShredRepairType > {
635
- Self :: generate_repairs_for_slot ( blockstore, slot, slot_meta, max_repairs, true )
650
+ Self :: generate_repairs_for_slot (
651
+ blockstore,
652
+ slot,
653
+ slot_meta,
654
+ max_repairs,
655
+ true ,
656
+ outstanding_repairs,
657
+ )
636
658
}
637
659
638
660
/// Generates repair requests for `slot` without tick-based throttling.
///
/// Thin wrapper around `generate_repairs_for_slot` that passes `false` for
/// `throttle_requests_by_shred_tick`; every other argument is forwarded as-is.
///
/// `outstanding_repairs` maps each already-issued repair request to the
/// timestamp at which it was recorded. The callee omits requests that are
/// still present in the map and records the ones it returns, so callers that
/// want de-duplication across calls must pass the same map each time.
pub fn generate_repairs_for_slot_not_throttled_by_tick (
639
661
blockstore : & Blockstore ,
640
662
slot : Slot ,
641
663
slot_meta : & SlotMeta ,
642
664
max_repairs : usize ,
665
+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
643
666
) -> Vec < ShredRepairType > {
644
- Self :: generate_repairs_for_slot ( blockstore, slot, slot_meta, max_repairs, false )
667
+ Self :: generate_repairs_for_slot (
668
+ blockstore,
669
+ slot,
670
+ slot_meta,
671
+ max_repairs,
672
+ false ,
673
+ outstanding_repairs,
674
+ )
645
675
}
646
676
647
677
/// If this slot is missing shreds, generate repairs
@@ -651,6 +681,7 @@ impl RepairService {
651
681
slot_meta : & SlotMeta ,
652
682
max_repairs : usize ,
653
683
throttle_requests_by_shred_tick : bool ,
684
+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
654
685
) -> Vec < ShredRepairType > {
655
686
let defer_repair_threshold_ticks = if throttle_requests_by_shred_tick {
656
687
DEFER_REPAIR_THRESHOLD_TICKS
@@ -680,7 +711,14 @@ impl RepairService {
680
711
}
681
712
}
682
713
}
683
- vec ! [ ShredRepairType :: HighestShred ( slot, slot_meta. received) ]
714
+
715
+ let repair_request = ShredRepairType :: HighestShred ( slot, slot_meta. received ) ;
716
+ if let Entry :: Vacant ( entry) = outstanding_repairs. entry ( repair_request) {
717
+ entry. insert ( timestamp ( ) ) ;
718
+ vec ! [ repair_request]
719
+ } else {
720
+ vec ! [ ]
721
+ }
684
722
} else {
685
723
blockstore
686
724
. find_missing_data_indexes (
@@ -692,7 +730,15 @@ impl RepairService {
692
730
max_repairs,
693
731
)
694
732
. into_iter ( )
695
- . map ( |i| ShredRepairType :: Shred ( slot, i) )
733
+ . filter_map ( |i| {
734
+ let repair_request = ShredRepairType :: Shred ( slot, i) ;
735
+ if let Entry :: Vacant ( entry) = outstanding_repairs. entry ( repair_request) {
736
+ entry. insert ( timestamp ( ) ) ;
737
+ Some ( repair_request)
738
+ } else {
739
+ None
740
+ }
741
+ } )
696
742
. collect ( )
697
743
}
698
744
}
@@ -703,6 +749,7 @@ impl RepairService {
703
749
repairs : & mut Vec < ShredRepairType > ,
704
750
max_repairs : usize ,
705
751
slot : Slot ,
752
+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
706
753
) {
707
754
let mut pending_slots = vec ! [ slot] ;
708
755
while repairs. len ( ) < max_repairs && !pending_slots. is_empty ( ) {
@@ -713,6 +760,7 @@ impl RepairService {
713
760
slot,
714
761
& slot_meta,
715
762
max_repairs - repairs. len ( ) ,
763
+ outstanding_repairs,
716
764
) ;
717
765
repairs. extend ( new_repairs) ;
718
766
let next_slots = slot_meta. next_slots ;
@@ -727,6 +775,7 @@ impl RepairService {
727
775
blockstore : & Blockstore ,
728
776
max_repairs : usize ,
729
777
slots : & Vec < Slot > ,
778
+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
730
779
) -> Vec < ShredRepairType > {
731
780
let mut repairs: Vec < ShredRepairType > = Vec :: new ( ) ;
732
781
for slot in slots {
@@ -738,6 +787,7 @@ impl RepairService {
738
787
* slot,
739
788
& slot_meta,
740
789
max_repairs - repairs. len ( ) ,
790
+ outstanding_repairs,
741
791
) ;
742
792
repairs. extend ( new_repairs) ;
743
793
} else {
@@ -911,6 +961,7 @@ impl RepairService {
911
961
slot,
912
962
& meta,
913
963
max_repairs - repairs. len ( ) ,
964
+ & mut HashMap :: default ( ) ,
914
965
) ;
915
966
repairs. extend ( new_repairs) ;
916
967
}
@@ -933,6 +984,7 @@ impl RepairService {
933
984
slot,
934
985
& slot_meta,
935
986
MAX_REPAIR_PER_DUPLICATE ,
987
+ & mut HashMap :: default ( ) ,
936
988
) )
937
989
}
938
990
} else {
@@ -1163,6 +1215,7 @@ mod test {
1163
1215
MAX_CLOSEST_COMPLETION_REPAIRS ,
1164
1216
& mut RepairTiming :: default ( ) ,
1165
1217
& mut BestRepairsStats :: default ( ) ,
1218
+ & mut HashMap :: default ( ) ,
1166
1219
) ,
1167
1220
vec![
1168
1221
ShredRepairType :: Orphan ( 2 ) ,
@@ -1195,6 +1248,7 @@ mod test {
1195
1248
MAX_CLOSEST_COMPLETION_REPAIRS ,
1196
1249
& mut RepairTiming :: default ( ) ,
1197
1250
& mut BestRepairsStats :: default ( ) ,
1251
+ & mut HashMap :: default ( ) ,
1198
1252
) ,
1199
1253
vec![ ShredRepairType :: HighestShred ( 0 , 0 ) ]
1200
1254
) ;
@@ -1252,6 +1306,7 @@ mod test {
1252
1306
MAX_CLOSEST_COMPLETION_REPAIRS ,
1253
1307
& mut RepairTiming :: default ( ) ,
1254
1308
& mut BestRepairsStats :: default ( ) ,
1309
+ & mut HashMap :: default ( ) ,
1255
1310
) ,
1256
1311
expected
1257
1312
) ;
@@ -1267,6 +1322,7 @@ mod test {
1267
1322
MAX_CLOSEST_COMPLETION_REPAIRS ,
1268
1323
& mut RepairTiming :: default ( ) ,
1269
1324
& mut BestRepairsStats :: default ( ) ,
1325
+ & mut HashMap :: default ( ) ,
1270
1326
) [ ..] ,
1271
1327
expected[ 0 ..expected. len( ) - 2 ]
1272
1328
) ;
@@ -1310,6 +1366,7 @@ mod test {
1310
1366
MAX_CLOSEST_COMPLETION_REPAIRS ,
1311
1367
& mut RepairTiming :: default ( ) ,
1312
1368
& mut BestRepairsStats :: default ( ) ,
1369
+ & mut HashMap :: default ( ) ,
1313
1370
) ,
1314
1371
expected
1315
1372
) ;
@@ -1627,6 +1684,7 @@ mod test {
1627
1684
& blockstore,
1628
1685
max_repairs,
1629
1686
& slots_to_repair,
1687
+ & mut HashMap :: default ( ) ,
1630
1688
) ;
1631
1689
assert ! ( result. is_empty( ) ) ;
1632
1690
@@ -1636,6 +1694,7 @@ mod test {
1636
1694
& blockstore,
1637
1695
max_repairs,
1638
1696
& slots_to_repair,
1697
+ & mut HashMap :: default ( ) ,
1639
1698
) ;
1640
1699
assert_eq ! (
1641
1700
result,
@@ -1651,6 +1710,7 @@ mod test {
1651
1710
& blockstore,
1652
1711
max_repairs,
1653
1712
& slots_to_repair,
1713
+ & mut HashMap :: default ( ) ,
1654
1714
) ;
1655
1715
assert_eq ! ( result. len( ) , max_repairs) ;
1656
1716
assert_eq ! (
0 commit comments