4242 } ,
4343 solana_streamer:: sendmmsg:: { batch_send, SendPktsError } ,
4444 std:: {
45- collections:: { HashMap , HashSet } ,
45+ collections:: { hash_map :: Entry , HashMap , HashSet } ,
4646 iter:: Iterator ,
4747 net:: { SocketAddr , UdpSocket } ,
4848 sync:: {
5959const DEFER_REPAIR_THRESHOLD : Duration = Duration :: from_millis ( 200 ) ;
6060const DEFER_REPAIR_THRESHOLD_TICKS : u64 = DEFER_REPAIR_THRESHOLD . as_millis ( ) as u64 / MS_PER_TICK ;
6161
62+ // This is the amount of time we will wait for a repair request to be fulfilled
63+ // before making another request. The value is based on a reasonable upper
64+ // bound on the expected network delays in requesting repairs and receiving shreds.
65+ const REPAIR_REQUEST_TIMEOUT_MS : u64 = 100 ;
66+
6267// When requesting repair for a specific shred through the admin RPC, we will
6368// request up to NUM_PEERS_TO_SAMPLE_FOR_REPAIRS in the event a specific, valid
6469// target node is not provided. This number was chosen to provide reasonable
@@ -208,7 +213,7 @@ impl BestRepairsStats {
208213pub const MAX_REPAIR_LENGTH : usize = 512 ;
209214pub const MAX_REPAIR_PER_DUPLICATE : usize = 20 ;
210215pub const MAX_DUPLICATE_WAIT_MS : usize = 10_000 ;
211- pub const REPAIR_MS : u64 = 100 ;
216+ pub const REPAIR_MS : u64 = 1 ;
212217pub const MAX_ORPHANS : usize = 5 ;
213218pub const MAX_UNKNOWN_LAST_INDEX_REPAIRS : usize = 10 ;
214219pub const MAX_CLOSEST_COMPLETION_REPAIRS : usize = 100 ;
@@ -328,6 +333,8 @@ impl RepairService {
328333 let mut last_stats = Instant :: now ( ) ;
329334 let mut peers_cache = LruCache :: new ( REPAIR_PEERS_CACHE_CAPACITY ) ;
330335 let mut popular_pruned_forks_requests = HashSet :: new ( ) ;
336+ // Maps a repair that may still be outstanding to the timestamp at which it was requested.
337+ let mut outstanding_repairs = HashMap :: new ( ) ;
331338
332339 while !exit. load ( Ordering :: Relaxed ) {
333340 let mut set_root_elapsed;
@@ -399,11 +406,17 @@ impl RepairService {
399406 ) ;
400407 add_votes_elapsed. stop ( ) ;
401408
409+ // Purge old entries. They've either completed or need to be retried.
410+ outstanding_repairs. retain ( |_repair_request, time| {
411+ timestamp ( ) . saturating_sub ( * time) < REPAIR_REQUEST_TIMEOUT_MS
412+ } ) ;
413+
402414 let repairs = match repair_info. wen_restart_repair_slots . clone ( ) {
403415 Some ( slots_to_repair) => Self :: generate_repairs_for_wen_restart (
404416 blockstore,
405417 MAX_REPAIR_LENGTH ,
406418 & slots_to_repair. read ( ) . unwrap ( ) ,
419+ & mut outstanding_repairs,
407420 ) ,
408421 None => repair_weight. get_best_weighted_repairs (
409422 blockstore,
@@ -415,6 +428,7 @@ impl RepairService {
415428 MAX_CLOSEST_COMPLETION_REPAIRS ,
416429 & mut repair_timing,
417430 & mut best_repairs_stats,
431+ & mut outstanding_repairs,
418432 ) ,
419433 } ;
420434
@@ -631,17 +645,33 @@ impl RepairService {
631645 slot : Slot ,
632646 slot_meta : & SlotMeta ,
633647 max_repairs : usize ,
648+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
634649 ) -> Vec < ShredRepairType > {
635- Self :: generate_repairs_for_slot ( blockstore, slot, slot_meta, max_repairs, true )
650+ Self :: generate_repairs_for_slot (
651+ blockstore,
652+ slot,
653+ slot_meta,
654+ max_repairs,
655+ true ,
656+ outstanding_repairs,
657+ )
636658 }
637659
638660 pub fn generate_repairs_for_slot_not_throttled_by_tick (
639661 blockstore : & Blockstore ,
640662 slot : Slot ,
641663 slot_meta : & SlotMeta ,
642664 max_repairs : usize ,
665+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
643666 ) -> Vec < ShredRepairType > {
644- Self :: generate_repairs_for_slot ( blockstore, slot, slot_meta, max_repairs, false )
667+ Self :: generate_repairs_for_slot (
668+ blockstore,
669+ slot,
670+ slot_meta,
671+ max_repairs,
672+ false ,
673+ outstanding_repairs,
674+ )
645675 }
646676
647677 /// If this slot is missing shreds generate repairs
@@ -651,6 +681,7 @@ impl RepairService {
651681 slot_meta : & SlotMeta ,
652682 max_repairs : usize ,
653683 throttle_requests_by_shred_tick : bool ,
684+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
654685 ) -> Vec < ShredRepairType > {
655686 let defer_repair_threshold_ticks = if throttle_requests_by_shred_tick {
656687 DEFER_REPAIR_THRESHOLD_TICKS
@@ -680,7 +711,14 @@ impl RepairService {
680711 }
681712 }
682713 }
683- vec ! [ ShredRepairType :: HighestShred ( slot, slot_meta. received) ]
714+
715+ let repair_request = ShredRepairType :: HighestShred ( slot, slot_meta. received ) ;
716+ if let Entry :: Vacant ( entry) = outstanding_repairs. entry ( repair_request) {
717+ entry. insert ( timestamp ( ) ) ;
718+ vec ! [ repair_request]
719+ } else {
720+ vec ! [ ]
721+ }
684722 } else {
685723 blockstore
686724 . find_missing_data_indexes (
@@ -692,7 +730,15 @@ impl RepairService {
692730 max_repairs,
693731 )
694732 . into_iter ( )
695- . map ( |i| ShredRepairType :: Shred ( slot, i) )
733+ . filter_map ( |i| {
734+ let repair_request = ShredRepairType :: Shred ( slot, i) ;
735+ if let Entry :: Vacant ( entry) = outstanding_repairs. entry ( repair_request) {
736+ entry. insert ( timestamp ( ) ) ;
737+ Some ( repair_request)
738+ } else {
739+ None
740+ }
741+ } )
696742 . collect ( )
697743 }
698744 }
@@ -703,6 +749,7 @@ impl RepairService {
703749 repairs : & mut Vec < ShredRepairType > ,
704750 max_repairs : usize ,
705751 slot : Slot ,
752+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
706753 ) {
707754 let mut pending_slots = vec ! [ slot] ;
708755 while repairs. len ( ) < max_repairs && !pending_slots. is_empty ( ) {
@@ -713,6 +760,7 @@ impl RepairService {
713760 slot,
714761 & slot_meta,
715762 max_repairs - repairs. len ( ) ,
763+ outstanding_repairs,
716764 ) ;
717765 repairs. extend ( new_repairs) ;
718766 let next_slots = slot_meta. next_slots ;
@@ -727,6 +775,7 @@ impl RepairService {
727775 blockstore : & Blockstore ,
728776 max_repairs : usize ,
729777 slots : & Vec < Slot > ,
778+ outstanding_repairs : & mut HashMap < ShredRepairType , u64 > ,
730779 ) -> Vec < ShredRepairType > {
731780 let mut repairs: Vec < ShredRepairType > = Vec :: new ( ) ;
732781 for slot in slots {
@@ -738,6 +787,7 @@ impl RepairService {
738787 * slot,
739788 & slot_meta,
740789 max_repairs - repairs. len ( ) ,
790+ outstanding_repairs,
741791 ) ;
742792 repairs. extend ( new_repairs) ;
743793 } else {
@@ -911,6 +961,7 @@ impl RepairService {
911961 slot,
912962 & meta,
913963 max_repairs - repairs. len ( ) ,
964+ & mut HashMap :: default ( ) ,
914965 ) ;
915966 repairs. extend ( new_repairs) ;
916967 }
@@ -933,6 +984,7 @@ impl RepairService {
933984 slot,
934985 & slot_meta,
935986 MAX_REPAIR_PER_DUPLICATE ,
987+ & mut HashMap :: default ( ) ,
936988 ) )
937989 }
938990 } else {
@@ -1163,6 +1215,7 @@ mod test {
11631215 MAX_CLOSEST_COMPLETION_REPAIRS ,
11641216 & mut RepairTiming :: default ( ) ,
11651217 & mut BestRepairsStats :: default ( ) ,
1218+ & mut HashMap :: default ( ) ,
11661219 ) ,
11671220 vec![
11681221 ShredRepairType :: Orphan ( 2 ) ,
@@ -1195,6 +1248,7 @@ mod test {
11951248 MAX_CLOSEST_COMPLETION_REPAIRS ,
11961249 & mut RepairTiming :: default ( ) ,
11971250 & mut BestRepairsStats :: default ( ) ,
1251+ & mut HashMap :: default ( ) ,
11981252 ) ,
11991253 vec![ ShredRepairType :: HighestShred ( 0 , 0 ) ]
12001254 ) ;
@@ -1252,6 +1306,7 @@ mod test {
12521306 MAX_CLOSEST_COMPLETION_REPAIRS ,
12531307 & mut RepairTiming :: default ( ) ,
12541308 & mut BestRepairsStats :: default ( ) ,
1309+ & mut HashMap :: default ( ) ,
12551310 ) ,
12561311 expected
12571312 ) ;
@@ -1267,6 +1322,7 @@ mod test {
12671322 MAX_CLOSEST_COMPLETION_REPAIRS ,
12681323 & mut RepairTiming :: default ( ) ,
12691324 & mut BestRepairsStats :: default ( ) ,
1325+ & mut HashMap :: default ( ) ,
12701326 ) [ ..] ,
12711327 expected[ 0 ..expected. len( ) - 2 ]
12721328 ) ;
@@ -1310,6 +1366,7 @@ mod test {
13101366 MAX_CLOSEST_COMPLETION_REPAIRS ,
13111367 & mut RepairTiming :: default ( ) ,
13121368 & mut BestRepairsStats :: default ( ) ,
1369+ & mut HashMap :: default ( ) ,
13131370 ) ,
13141371 expected
13151372 ) ;
@@ -1627,6 +1684,7 @@ mod test {
16271684 & blockstore,
16281685 max_repairs,
16291686 & slots_to_repair,
1687+ & mut HashMap :: default ( ) ,
16301688 ) ;
16311689 assert ! ( result. is_empty( ) ) ;
16321690
@@ -1636,6 +1694,7 @@ mod test {
16361694 & blockstore,
16371695 max_repairs,
16381696 & slots_to_repair,
1697+ & mut HashMap :: default ( ) ,
16391698 ) ;
16401699 assert_eq ! (
16411700 result,
@@ -1651,6 +1710,7 @@ mod test {
16511710 & blockstore,
16521711 max_repairs,
16531712 & slots_to_repair,
1713+ & mut HashMap :: default ( ) ,
16541714 ) ;
16551715 assert_eq ! ( result. len( ) , max_repairs) ;
16561716 assert_eq ! (
0 commit comments