Skip to content

Commit d9b8221

Browse files
committed
unthrottle repair
1 parent f76c121 commit d9b8221

File tree

4 files changed

+172
-27
lines changed

4 files changed

+172
-27
lines changed

core/src/repair/repair_generic_traversal.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ use {
44
repair::{repair_service::RepairService, serve_repair::ShredRepairType},
55
},
66
solana_ledger::{blockstore::Blockstore, blockstore_meta::SlotMeta},
7-
solana_sdk::{clock::Slot, hash::Hash},
8-
std::collections::{HashMap, HashSet},
7+
solana_sdk::{clock::Slot, hash::Hash, timing::timestamp},
8+
std::collections::{hash_map::Entry, HashMap, HashSet},
99
};
1010

1111
struct GenericTraversal<'a> {
@@ -47,6 +47,7 @@ pub fn get_unknown_last_index(
4747
slot_meta_cache: &mut HashMap<Slot, Option<SlotMeta>>,
4848
processed_slots: &mut HashSet<Slot>,
4949
limit: usize,
50+
outstanding_repairs: &mut HashMap<ShredRepairType, u64>,
5051
) -> Vec<ShredRepairType> {
5152
let iter = GenericTraversal::new(tree);
5253
let mut unknown_last = Vec::new();
@@ -74,8 +75,16 @@ pub fn get_unknown_last_index(
7475
unknown_last.sort_by(|(_, _, count1), (_, _, count2)| count2.cmp(count1));
7576
unknown_last
7677
.iter()
78+
.filter_map(|(slot, received, _)| {
79+
let repair_request = ShredRepairType::HighestShred(*slot, *received);
80+
if let Entry::Vacant(entry) = outstanding_repairs.entry(repair_request) {
81+
entry.insert(timestamp());
82+
Some(repair_request)
83+
} else {
84+
None
85+
}
86+
})
7787
.take(limit)
78-
.map(|(slot, received, _)| ShredRepairType::HighestShred(*slot, *received))
7988
.collect()
8089
}
8190

@@ -115,6 +124,7 @@ pub fn get_closest_completion(
115124
slot_meta_cache: &mut HashMap<Slot, Option<SlotMeta>>,
116125
processed_slots: &mut HashSet<Slot>,
117126
limit: usize,
127+
outstanding_repairs: &mut HashMap<ShredRepairType, u64>,
118128
) -> (Vec<ShredRepairType>, /* processed slots */ usize) {
119129
let mut slot_dists: Vec<(Slot, u64)> = Vec::default();
120130
let iter = GenericTraversal::new(tree);
@@ -192,6 +202,7 @@ pub fn get_closest_completion(
192202
path_slot,
193203
slot_meta,
194204
limit - repairs.len(),
205+
outstanding_repairs,
195206
);
196207
repairs.extend(new_repairs);
197208
total_processed_slots += 1;
@@ -217,12 +228,14 @@ pub mod test {
217228
let last_shred = blockstore.meta(0).unwrap().unwrap().received;
218229
let mut slot_meta_cache = HashMap::default();
219230
let mut processed_slots = HashSet::default();
231+
let mut outstanding_requests = HashMap::new();
220232
let repairs = get_unknown_last_index(
221233
&heaviest_subtree_fork_choice,
222234
&blockstore,
223235
&mut slot_meta_cache,
224236
&mut processed_slots,
225237
10,
238+
&mut outstanding_requests,
226239
);
227240
assert_eq!(
228241
repairs,
@@ -231,22 +244,26 @@ pub mod test {
231244
.map(|slot| ShredRepairType::HighestShred(*slot, last_shred))
232245
.collect::<Vec<_>>()
233246
);
247+
assert_eq!(outstanding_requests.len(), repairs.len());
234248
}
235249

236250
#[test]
237251
fn test_get_closest_completion() {
238252
let (blockstore, heaviest_subtree_fork_choice) = setup_forks();
239253
let mut slot_meta_cache = HashMap::default();
240254
let mut processed_slots = HashSet::default();
255+
let mut outstanding_requests = HashMap::new();
241256
let (repairs, _) = get_closest_completion(
242257
&heaviest_subtree_fork_choice,
243258
&blockstore,
244259
0, // root_slot
245260
&mut slot_meta_cache,
246261
&mut processed_slots,
247262
10,
263+
&mut outstanding_requests,
248264
);
249265
assert_eq!(repairs, []);
266+
assert_eq!(outstanding_requests.len(), repairs.len());
250267

251268
let forks = tr(0) / (tr(1) / (tr(2) / (tr(4))) / (tr(3) / (tr(5))));
252269
let ledger_path = get_tmp_ledger_path!();
@@ -262,6 +279,7 @@ pub mod test {
262279
let heaviest_subtree_fork_choice = HeaviestSubtreeForkChoice::new_from_tree(forks);
263280
let mut slot_meta_cache = HashMap::default();
264281
let mut processed_slots = HashSet::default();
282+
outstanding_requests = HashMap::new();
265283
sleep_shred_deferment_period();
266284
let (repairs, _) = get_closest_completion(
267285
&heaviest_subtree_fork_choice,
@@ -270,8 +288,10 @@ pub mod test {
270288
&mut slot_meta_cache,
271289
&mut processed_slots,
272290
1,
291+
&mut outstanding_requests,
273292
);
274293
assert_eq!(repairs, [ShredRepairType::Shred(1, 30)]);
294+
assert_eq!(outstanding_requests.len(), repairs.len());
275295
}
276296

277297
fn add_tree_with_missing_shreds(

core/src/repair/repair_service.rs

Lines changed: 66 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ use {
4242
},
4343
solana_streamer::sendmmsg::{batch_send, SendPktsError},
4444
std::{
45-
collections::{HashMap, HashSet},
45+
collections::{hash_map::Entry, HashMap, HashSet},
4646
iter::Iterator,
4747
net::{SocketAddr, UdpSocket},
4848
sync::{
@@ -59,6 +59,11 @@ use {
5959
const DEFER_REPAIR_THRESHOLD: Duration = Duration::from_millis(200);
6060
const DEFER_REPAIR_THRESHOLD_TICKS: u64 = DEFER_REPAIR_THRESHOLD.as_millis() as u64 / MS_PER_TICK;
6161

62+
// This is the amount of time we will wait for a repair request to be fulfilled
63+
// before making another request. The value is based on a reasonable upper bound of
64+
// expected network delays in requesting repairs and receiving shreds.
65+
const REPAIR_REQUEST_TIMEOUT_MS: u64 = 100;
66+
6267
// When requesting repair for a specific shred through the admin RPC, we will
6368
// request up to NUM_PEERS_TO_SAMPLE_FOR_REPAIRS in the event a specific, valid
6469
// target node is not provided. This number was chosen to provide reasonable
@@ -208,7 +213,7 @@ impl BestRepairsStats {
208213
pub const MAX_REPAIR_LENGTH: usize = 512;
209214
pub const MAX_REPAIR_PER_DUPLICATE: usize = 20;
210215
pub const MAX_DUPLICATE_WAIT_MS: usize = 10_000;
211-
pub const REPAIR_MS: u64 = 100;
216+
pub const REPAIR_MS: u64 = 1;
212217
pub const MAX_ORPHANS: usize = 5;
213218
pub const MAX_UNKNOWN_LAST_INDEX_REPAIRS: usize = 10;
214219
pub const MAX_CLOSEST_COMPLETION_REPAIRS: usize = 100;
@@ -328,6 +333,8 @@ impl RepairService {
328333
let mut last_stats = Instant::now();
329334
let mut peers_cache = LruCache::new(REPAIR_PEERS_CACHE_CAPACITY);
330335
let mut popular_pruned_forks_requests = HashSet::new();
336+
// Maps a repair that may still be outstanding to the timestamp at which it was requested.
337+
let mut outstanding_repairs = HashMap::new();
331338

332339
while !exit.load(Ordering::Relaxed) {
333340
let mut set_root_elapsed;
@@ -399,11 +406,17 @@ impl RepairService {
399406
);
400407
add_votes_elapsed.stop();
401408

409+
// Purge old entries. They've either completed or need to be retried.
410+
outstanding_repairs.retain(|_repair_request, time| {
411+
timestamp().saturating_sub(*time) < REPAIR_REQUEST_TIMEOUT_MS
412+
});
413+
402414
let repairs = match repair_info.wen_restart_repair_slots.clone() {
403415
Some(slots_to_repair) => Self::generate_repairs_for_wen_restart(
404416
blockstore,
405417
MAX_REPAIR_LENGTH,
406418
&slots_to_repair.read().unwrap(),
419+
&mut outstanding_repairs,
407420
),
408421
None => repair_weight.get_best_weighted_repairs(
409422
blockstore,
@@ -415,6 +428,7 @@ impl RepairService {
415428
MAX_CLOSEST_COMPLETION_REPAIRS,
416429
&mut repair_timing,
417430
&mut best_repairs_stats,
431+
&mut outstanding_repairs,
418432
),
419433
};
420434

@@ -631,17 +645,33 @@ impl RepairService {
631645
slot: Slot,
632646
slot_meta: &SlotMeta,
633647
max_repairs: usize,
648+
outstanding_repairs: &mut HashMap<ShredRepairType, u64>,
634649
) -> Vec<ShredRepairType> {
635-
Self::generate_repairs_for_slot(blockstore, slot, slot_meta, max_repairs, true)
650+
Self::generate_repairs_for_slot(
651+
blockstore,
652+
slot,
653+
slot_meta,
654+
max_repairs,
655+
true,
656+
outstanding_repairs,
657+
)
636658
}
637659

638660
pub fn generate_repairs_for_slot_not_throttled_by_tick(
639661
blockstore: &Blockstore,
640662
slot: Slot,
641663
slot_meta: &SlotMeta,
642664
max_repairs: usize,
665+
outstanding_repairs: &mut HashMap<ShredRepairType, u64>,
643666
) -> Vec<ShredRepairType> {
644-
Self::generate_repairs_for_slot(blockstore, slot, slot_meta, max_repairs, false)
667+
Self::generate_repairs_for_slot(
668+
blockstore,
669+
slot,
670+
slot_meta,
671+
max_repairs,
672+
false,
673+
outstanding_repairs,
674+
)
645675
}
646676

647677
/// If this slot is missing shreds generate repairs
@@ -651,6 +681,7 @@ impl RepairService {
651681
slot_meta: &SlotMeta,
652682
max_repairs: usize,
653683
throttle_requests_by_shred_tick: bool,
684+
outstanding_repairs: &mut HashMap<ShredRepairType, u64>,
654685
) -> Vec<ShredRepairType> {
655686
let defer_repair_threshold_ticks = if throttle_requests_by_shred_tick {
656687
DEFER_REPAIR_THRESHOLD_TICKS
@@ -680,7 +711,14 @@ impl RepairService {
680711
}
681712
}
682713
}
683-
vec![ShredRepairType::HighestShred(slot, slot_meta.received)]
714+
715+
let repair_request = ShredRepairType::HighestShred(slot, slot_meta.received);
716+
if let Entry::Vacant(entry) = outstanding_repairs.entry(repair_request) {
717+
entry.insert(timestamp());
718+
vec![repair_request]
719+
} else {
720+
vec![]
721+
}
684722
} else {
685723
blockstore
686724
.find_missing_data_indexes(
@@ -692,7 +730,15 @@ impl RepairService {
692730
max_repairs,
693731
)
694732
.into_iter()
695-
.map(|i| ShredRepairType::Shred(slot, i))
733+
.filter_map(|i| {
734+
let repair_request = ShredRepairType::Shred(slot, i);
735+
if let Entry::Vacant(entry) = outstanding_repairs.entry(repair_request) {
736+
entry.insert(timestamp());
737+
Some(repair_request)
738+
} else {
739+
None
740+
}
741+
})
696742
.collect()
697743
}
698744
}
@@ -703,6 +749,7 @@ impl RepairService {
703749
repairs: &mut Vec<ShredRepairType>,
704750
max_repairs: usize,
705751
slot: Slot,
752+
outstanding_repairs: &mut HashMap<ShredRepairType, u64>,
706753
) {
707754
let mut pending_slots = vec![slot];
708755
while repairs.len() < max_repairs && !pending_slots.is_empty() {
@@ -713,6 +760,7 @@ impl RepairService {
713760
slot,
714761
&slot_meta,
715762
max_repairs - repairs.len(),
763+
outstanding_repairs,
716764
);
717765
repairs.extend(new_repairs);
718766
let next_slots = slot_meta.next_slots;
@@ -727,6 +775,7 @@ impl RepairService {
727775
blockstore: &Blockstore,
728776
max_repairs: usize,
729777
slots: &Vec<Slot>,
778+
outstanding_repairs: &mut HashMap<ShredRepairType, u64>,
730779
) -> Vec<ShredRepairType> {
731780
let mut repairs: Vec<ShredRepairType> = Vec::new();
732781
for slot in slots {
@@ -738,6 +787,7 @@ impl RepairService {
738787
*slot,
739788
&slot_meta,
740789
max_repairs - repairs.len(),
790+
outstanding_repairs,
741791
);
742792
repairs.extend(new_repairs);
743793
} else {
@@ -911,6 +961,7 @@ impl RepairService {
911961
slot,
912962
&meta,
913963
max_repairs - repairs.len(),
964+
&mut HashMap::default(),
914965
);
915966
repairs.extend(new_repairs);
916967
}
@@ -933,6 +984,7 @@ impl RepairService {
933984
slot,
934985
&slot_meta,
935986
MAX_REPAIR_PER_DUPLICATE,
987+
&mut HashMap::default(),
936988
))
937989
}
938990
} else {
@@ -1163,6 +1215,7 @@ mod test {
11631215
MAX_CLOSEST_COMPLETION_REPAIRS,
11641216
&mut RepairTiming::default(),
11651217
&mut BestRepairsStats::default(),
1218+
&mut HashMap::default(),
11661219
),
11671220
vec![
11681221
ShredRepairType::Orphan(2),
@@ -1195,6 +1248,7 @@ mod test {
11951248
MAX_CLOSEST_COMPLETION_REPAIRS,
11961249
&mut RepairTiming::default(),
11971250
&mut BestRepairsStats::default(),
1251+
&mut HashMap::default(),
11981252
),
11991253
vec![ShredRepairType::HighestShred(0, 0)]
12001254
);
@@ -1252,6 +1306,7 @@ mod test {
12521306
MAX_CLOSEST_COMPLETION_REPAIRS,
12531307
&mut RepairTiming::default(),
12541308
&mut BestRepairsStats::default(),
1309+
&mut HashMap::default(),
12551310
),
12561311
expected
12571312
);
@@ -1267,6 +1322,7 @@ mod test {
12671322
MAX_CLOSEST_COMPLETION_REPAIRS,
12681323
&mut RepairTiming::default(),
12691324
&mut BestRepairsStats::default(),
1325+
&mut HashMap::default(),
12701326
)[..],
12711327
expected[0..expected.len() - 2]
12721328
);
@@ -1310,6 +1366,7 @@ mod test {
13101366
MAX_CLOSEST_COMPLETION_REPAIRS,
13111367
&mut RepairTiming::default(),
13121368
&mut BestRepairsStats::default(),
1369+
&mut HashMap::default(),
13131370
),
13141371
expected
13151372
);
@@ -1627,6 +1684,7 @@ mod test {
16271684
&blockstore,
16281685
max_repairs,
16291686
&slots_to_repair,
1687+
&mut HashMap::default(),
16301688
);
16311689
assert!(result.is_empty());
16321690

@@ -1636,6 +1694,7 @@ mod test {
16361694
&blockstore,
16371695
max_repairs,
16381696
&slots_to_repair,
1697+
&mut HashMap::default(),
16391698
);
16401699
assert_eq!(
16411700
result,
@@ -1651,6 +1710,7 @@ mod test {
16511710
&blockstore,
16521711
max_repairs,
16531712
&slots_to_repair,
1713+
&mut HashMap::default(),
16541714
);
16551715
assert_eq!(result.len(), max_repairs);
16561716
assert_eq!(

0 commit comments

Comments
 (0)