
Commit ddd01e5

chore: add penalty in latency-based routing
1 parent 386d353 commit ddd01e5

1 file changed: +50 -22 lines changed

Diff for: ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs

@@ -7,8 +7,10 @@ use crate::agent::http_transport::dynamic_routing::{
     health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot,
 };
 
-// Some big value implying that node is unhealthy, should be much bigger than node's latency.
-const MAX_LATENCY: Duration = Duration::from_secs(500);
+// When a node is detected as unhealthy, we take the following actions:
+// - Remove the node entirely from the routing.
+// - Penalize its moving average by adding a specified value to the stored latency window. This ensures that any node exhibiting intermittent outages is appropriately penalized.
+const PUNISH_LATENCY: Duration = Duration::from_secs(2);
 
 const WINDOW_SIZE: usize = 15;
 
@@ -20,6 +22,8 @@ type LatencyMovAvg = SumTreeSMA<Duration, u32, WINDOW_SIZE>;
 #[derive(Clone, Debug)]
 struct WeightedNode {
     node: Node,
+    /// Reflects the status of the most recent health check.
+    is_healthy: bool,
     /// Moving mean of latencies measurements.
     latency_mov_avg: LatencyMovAvg,
     /// Weight of the node (invers of the average latency), used for stochastic weighted random sampling.
@@ -49,14 +53,14 @@ impl LatencyRoutingSnapshot {
 /// Helper function to sample nodes based on their weights.
 /// Here weight index is selected based on the input number in range [0, 1]
 #[inline(always)]
-fn weighted_sample(weights: &[f64], number: f64) -> Option<usize> {
+fn weighted_sample(weighted_nodes: &[(f64, &Node)], number: f64) -> Option<usize> {
     if !(0.0..=1.0).contains(&number) {
         return None;
     }
-    let sum: f64 = weights.iter().sum();
+    let sum: f64 = weighted_nodes.iter().map(|n| n.0).sum();
     let mut weighted_number = number * sum;
-    for (idx, weight) in weights.iter().enumerate() {
-        weighted_number -= weight;
+    for (idx, weighted_node) in weighted_nodes.iter().enumerate() {
+        weighted_number -= weighted_node.0;
         if weighted_number <= 0.0 {
             return Some(idx);
         }
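weighted_sample now walks (weight, &Node) tuples instead of a bare weight slice, but the prefix-sum selection itself is unchanged: scale the random number by the weight sum, then subtract weights until the running value drops to zero or below. The sketch below is a restatement of that logic over plain string labels (so it compiles without the ic-agent types) together with a quick frequency check of the expected ~1:2 pick ratio; it is illustrative only, not part of the commit.

// Illustrative restatement of the prefix-sum weighted sampling used above.
use rand::Rng;

fn weighted_sample<T>(weighted_items: &[(f64, T)], number: f64) -> Option<usize> {
    if !(0.0..=1.0).contains(&number) {
        return None;
    }
    let sum: f64 = weighted_items.iter().map(|n| n.0).sum();
    let mut weighted_number = number * sum;
    for (idx, item) in weighted_items.iter().enumerate() {
        weighted_number -= item.0;
        if weighted_number <= 0.0 {
            return Some(idx);
        }
    }
    None
}

fn main() {
    // Weights 1.0 and 2.0: the second label should be picked roughly twice as often.
    let arr = &[(1.0, "node_a"), (2.0, "node_b")];
    let mut rng = rand::thread_rng();
    let mut hits = [0u32; 2];
    for _ in 0..100_000 {
        if let Some(idx) = weighted_sample(arr, rng.gen::<f64>()) {
            hits[idx] += 1;
        }
    }
    println!("node_a: {}, node_b: {}", hits[0], hits[1]); // roughly 33% vs 67%
}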
@@ -70,18 +74,21 @@ impl RoutingSnapshot for LatencyRoutingSnapshot {
     }
 
     fn next(&self) -> Option<Node> {
-        // We select a node based on it's weight, using a stochastic weighted random sampling approach.
-        let weights = self
-            .weighted_nodes
-            .iter()
-            .map(|n| n.weight)
-            .collect::<Vec<_>>();
+        // We select a healthy node based on its weight, using a stochastic weighted random sampling approach.
+
+        // Preallocate array for a better efficiency.
+        let mut healthy_weighted_nodes = Vec::with_capacity(self.weighted_nodes.len());
+        for n in &self.weighted_nodes {
+            if n.is_healthy {
+                healthy_weighted_nodes.push((n.weight, &n.node));
+            }
+        }
         // Generate a random float in the range [0, 1)
         let mut rng = rand::thread_rng();
         let rand_num = rng.gen::<f64>();
         // Using this random float and an array of weights we get an index of the node.
-        let idx = weighted_sample(weights.as_slice(), rand_num);
-        idx.map(|idx| self.weighted_nodes[idx].node.clone())
+        let idx = weighted_sample(healthy_weighted_nodes.as_slice(), rand_num);
+        idx.map(|idx| healthy_weighted_nodes[idx].1.clone())
     }
 
     fn sync_nodes(&mut self, nodes: &[Node]) -> bool {
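With the new is_healthy flag, next() filters the candidates before sampling, so the returned index refers to healthy_weighted_nodes rather than weighted_nodes. Combined with update_node below, the per-node flow is roughly the following condensed restatement of the test further down. It is a sketch only: it relies on the crate-internal types, the LatencyRoutingSnapshot::new() constructor is assumed (it is not shown in this diff), and the domain name is hypothetical.

// Sketch of the lifecycle exercised by the tests (inside the crate's test module).
let mut snapshot = LatencyRoutingSnapshot::new(); // constructor assumed
let node = Node::new("api1.example.com").unwrap(); // hypothetical domain
snapshot.sync_nodes(&[node.clone()]);

// Healthy check: latency is recorded and the node becomes routable.
snapshot.update_node(&node, HealthCheckStatus::new(Some(Duration::from_secs(1))));
assert_eq!(snapshot.next().unwrap(), node);

// Failed check: the 2 s penalty enters the latency window and the node is filtered out.
snapshot.update_node(&node, HealthCheckStatus::new(None));
assert!(snapshot.next().is_none());

// A later successful check makes the node routable again; the penalty stays in its average.
snapshot.update_node(&node, HealthCheckStatus::new(Some(Duration::from_secs(1))));
assert_eq!(snapshot.next().unwrap(), node);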
@@ -116,11 +123,12 @@ impl RoutingSnapshot for LatencyRoutingSnapshot {
             return false;
         }
 
-        // If latency is None (meaning Node is unhealthy), we assign some big value
-        let latency = health.latency().unwrap_or(MAX_LATENCY);
+        // If the node is unhealthy, we penalize its moving average.
+        let latency = health.latency().unwrap_or(PUNISH_LATENCY);
 
         if let Some(idx) = self.weighted_nodes.iter().position(|x| &x.node == node) {
             // Node is already in the array (it is not the first update_node() call).
+            self.weighted_nodes[idx].is_healthy = health.is_healthy();
             self.weighted_nodes[idx].latency_mov_avg.add_sample(latency);
             let latency_avg = self.weighted_nodes[idx].latency_mov_avg.get_average();
             // As nodes with smaller average latencies are preferred for routing, we use inverted values for weights.
@@ -131,6 +139,7 @@ impl RoutingSnapshot for LatencyRoutingSnapshot {
             latency_mov_avg.add_sample(latency);
             let weight = 1.0 / latency_mov_avg.get_average().as_secs_f64();
             self.weighted_nodes.push(WeightedNode {
+                is_healthy: health.is_healthy(),
                 latency_mov_avg,
                 node: node.clone(),
                 weight,
@@ -152,7 +161,8 @@ mod tests {
         node::Node,
         snapshot::{
             latency_based_routing::{
-                weighted_sample, LatencyMovAvg, LatencyRoutingSnapshot, WeightedNode, MAX_LATENCY,
+                weighted_sample, LatencyMovAvg, LatencyRoutingSnapshot, WeightedNode,
+                PUNISH_LATENCY,
             },
             routing_snapshot::RoutingSnapshot,
         },
@@ -212,6 +222,7 @@ mod tests {
             Duration::from_millis(1500)
         );
         assert_eq!(weighted_node.weight, 1.0 / 1.5);
+        assert_eq!(snapshot.next().unwrap(), node);
         // Check third update
         let health = HealthCheckStatus::new(Some(Duration::from_secs(3)));
         let is_updated = snapshot.update_node(&node, health);
@@ -222,12 +233,25 @@ mod tests {
             Duration::from_millis(2000)
         );
         assert_eq!(weighted_node.weight, 0.5);
+        assert_eq!(snapshot.next().unwrap(), node);
         // Check forth update with none
         let health = HealthCheckStatus::new(None);
         let is_updated = snapshot.update_node(&node, health);
         assert!(is_updated);
         let weighted_node = snapshot.weighted_nodes.first().unwrap();
-        let avg_latency = Duration::from_secs_f64((MAX_LATENCY.as_secs() as f64 + 6.0) / 4.0);
+        let avg_latency = Duration::from_secs_f64((PUNISH_LATENCY.as_secs() as f64 + 6.0) / 4.0);
+        assert_eq!(weighted_node.latency_mov_avg.get_average(), avg_latency);
+        assert_eq!(weighted_node.weight, 1.0 / avg_latency.as_secs_f64());
+        assert_eq!(snapshot.weighted_nodes.len(), 1);
+        assert_eq!(snapshot.existing_nodes.len(), 1);
+        // No nodes returned, as the node is unhealthy.
+        assert!(snapshot.next().is_none());
+        // Check fifth update
+        let health = HealthCheckStatus::new(Some(Duration::from_secs(1)));
+        let is_updated = snapshot.update_node(&node, health);
+        assert!(is_updated);
+        let weighted_node = snapshot.weighted_nodes.first().unwrap();
+        let avg_latency = Duration::from_secs_f64((PUNISH_LATENCY.as_secs() as f64 + 7.0) / 5.0);
         assert_eq!(weighted_node.latency_mov_avg.get_average(), avg_latency);
         assert_eq!(weighted_node.weight, 1.0 / avg_latency.as_secs_f64());
         assert_eq!(snapshot.weighted_nodes.len(), 1);
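To check the arithmetic behind the new assertions: the first three healthy checks record 1 s, 2 s and 3 s, and the failed fourth check contributes the 2 s penalty, so the window holds 1 + 2 + 3 + 2 = 8 s over 4 samples (average 2 s, weight 0.5); the healthy fifth check at 1 s brings it to 9 s over 5 samples (average 1.8 s, weight 1 / 1.8 ≈ 0.56), matching the two avg_latency expressions above.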
@@ -250,6 +274,7 @@ mod tests {
         );
         // Add node_1 to weighted_nodes manually
         snapshot.weighted_nodes.push(WeightedNode {
+            is_healthy: true,
             node: node_1.clone(),
             latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO),
             weight: 0.0,
@@ -274,6 +299,7 @@ mod tests {
         assert!(snapshot.weighted_nodes.is_empty());
         // Add node_2 to weighted_nodes manually
         snapshot.weighted_nodes.push(WeightedNode {
+            is_healthy: true,
             node: node_2.clone(),
             latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO),
             weight: 0.0,
@@ -289,6 +315,7 @@ mod tests {
         assert_eq!(snapshot.weighted_nodes[0].node, node_2);
         // Add node_3 to weighted_nodes manually
         snapshot.weighted_nodes.push(WeightedNode {
+            is_healthy: true,
             node: node_3,
             latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO),
             weight: 0.0,
@@ -308,11 +335,12 @@ mod tests {
     #[test]
     fn test_weighted_sample() {
         // Case 1: empty array
-        let arr: &[f64] = &[];
+        let node = Node::new("ic0.com").unwrap();
+        let arr = &[(0.5, &node)];
         let idx = weighted_sample(arr, 0.5);
         assert_eq!(idx, None);
         // Case 2: single element in array
-        let arr: &[f64] = &[1.0];
+        let arr = &[(1.0, &node)];
         let idx = weighted_sample(arr, 0.0);
         assert_eq!(idx, Some(0));
         let idx = weighted_sample(arr, 1.0);
@@ -323,7 +351,7 @@ mod tests {
         let idx = weighted_sample(arr, 1.1);
         assert_eq!(idx, None);
         // Case 3: two elements in array (second element has twice the weight of the first)
-        let arr: &[f64] = &[1.0, 2.0]; // prefixed_sum = [1.0, 3.0]
+        let arr = &[(1.0, &node), (2.0, &node)]; // prefixed_sum = [1.0, 3.0]
         let idx = weighted_sample(arr, 0.0); // 0.0 * 3.0 < 1.0
         assert_eq!(idx, Some(0));
         let idx = weighted_sample(arr, 0.33); // 0.33 * 3.0 < 1.0
@@ -338,7 +366,7 @@ mod tests {
         let idx = weighted_sample(arr, 1.1);
         assert_eq!(idx, None);
         // Case 4: four elements in array
-        let arr: &[f64] = &[1.0, 2.0, 1.5, 2.5]; // prefixed_sum = [1.0, 3.0, 4.5, 7.0]
+        let arr = &[(1.0, &node), (2.0, &node), (1.5, &node), (2.5, &node)]; // prefixed_sum = [1.0, 3.0, 4.5, 7.0]
         let idx = weighted_sample(arr, 0.14); // 0.14 * 7 < 1.0
         assert_eq!(idx, Some(0)); // probability ~0.14
         let idx = weighted_sample(arr, 0.15); // 0.15 * 7 > 1.0
