Skip to content

Commit 3fe3975

Browse files
authored
feat(relay): add metrics for total items spooled and their size (#4511)
This PR extends the autoscaling endpoint by providing data about how many items are spooled and also how many bytes they use, which is currently only available for sqlite spooling.
1 parent 7eea540 commit 3fe3975

File tree

11 files changed

+264
-52
lines changed

11 files changed

+264
-52
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
- Track a utilization metric for internal services. ([#4501](https://github.com/getsentry/relay/pull/4501))
1414
- Add new `relay-threading` crate with asynchronous thread pool. ([#4500](https://github.com/getsentry/relay/pull/4500))
15+
- Expose additional metrics through the internal relay metric endpoint. ([#4511](https://github.com/getsentry/relay/pull/4511))
1516

1617
## 25.2.0
1718

relay-server/src/endpoints/autoscaling.rs

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use crate::http::StatusCode;
22
use crate::service::ServiceState;
33
use crate::services::autoscaling::{AutoscalingData, AutoscalingMessageKind};
4+
use std::fmt::Display;
45

56
/// Returns internal metrics data for relay.
67
pub async fn handle(state: ServiceState) -> (StatusCode, String) {
@@ -23,17 +24,22 @@ pub async fn handle(state: ServiceState) -> (StatusCode, String) {
2324

2425
/// Simple function to serialize a well-known format into a prometheus string.
2526
fn to_prometheus_string(data: &AutoscalingData) -> String {
26-
let mut result = String::with_capacity(32);
27-
result.push_str("memory_usage ");
28-
result.push_str(&data.memory_usage.to_string());
29-
result.push('\n');
30-
result.push_str("up ");
31-
result.push_str(&data.up.to_string());
32-
result.push('\n');
27+
let mut result = String::with_capacity(128);
3328

29+
append_data_row(&mut result, "memory_usage", data.memory_usage);
30+
append_data_row(&mut result, "up", data.up);
31+
append_data_row(&mut result, "item_count", data.item_count);
32+
append_data_row(&mut result, "total_size", data.total_size);
3433
result
3534
}
3635

36+
fn append_data_row(result: &mut String, label: &str, data: impl Display) {
37+
result.push_str(label);
38+
result.push(' ');
39+
result.push_str(&data.to_string());
40+
result.push('\n');
41+
}
42+
3743
#[cfg(test)]
3844
mod test {
3945
use crate::services::autoscaling::AutoscalingData;
@@ -43,8 +49,13 @@ mod test {
4349
let data = AutoscalingData {
4450
memory_usage: 0.75,
4551
up: 1,
52+
item_count: 10,
53+
total_size: 30,
4654
};
4755
let result = super::to_prometheus_string(&data);
48-
assert_eq!(result, "memory_usage 0.75\nup 1\n");
56+
assert_eq!(
57+
result,
58+
"memory_usage 0.75\nup 1\nitem_count 10\ntotal_size 30\n"
59+
);
4960
}
5061
}

relay-server/src/service.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ pub struct Registry {
7272
pub envelope_buffer: PartitionedEnvelopeBuffer,
7373

7474
pub project_cache_handle: ProjectCacheHandle,
75-
pub keda: Addr<AutoscalingMetrics>,
75+
pub autoscaling: Addr<AutoscalingMetrics>,
7676
}
7777

7878
/// Constructs a Tokio [`relay_system::Runtime`] configured for running [services](relay_system::Service).
@@ -189,8 +189,6 @@ impl ServiceState {
189189
let outcome_aggregator =
190190
services.start(OutcomeAggregator::new(&config, outcome_producer.clone()));
191191

192-
let keda = services.start(AutoscalingMetricService::new(memory_stat.clone()));
193-
194192
let (global_config, global_config_rx) =
195193
GlobalConfigService::new(config.clone(), upstream_relay.clone());
196194
let global_config_handle = global_config.handle();
@@ -288,6 +286,11 @@ impl ServiceState {
288286
envelope_buffer.clone(),
289287
));
290288

289+
let autoscaling = services.start(AutoscalingMetricService::new(
290+
memory_stat.clone(),
291+
envelope_buffer.clone(),
292+
));
293+
291294
services.start(RelayStats::new(
292295
config.clone(),
293296
handle.clone(),
@@ -312,7 +315,7 @@ impl ServiceState {
312315
project_cache_handle,
313316
upstream_relay,
314317
envelope_buffer,
315-
keda,
318+
autoscaling,
316319
};
317320

318321
let state = StateInner {
@@ -339,7 +342,7 @@ impl ServiceState {
339342
}
340343

341344
pub fn autoscaling(&self) -> &Addr<AutoscalingMetrics> {
342-
&self.inner.registry.keda
345+
&self.inner.registry.autoscaling
343346
}
344347

345348
/// Returns the V2 envelope buffer, if present.

relay-server/src/services/autoscaling.rs

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
1+
use crate::services::buffer::PartitionedEnvelopeBuffer;
12
use crate::MemoryStat;
23
use relay_system::{AsyncResponse, Controller, FromMessage, Interface, Sender, Service};
34
use serde::Serialize;
45

56
/// Service that tracks internal relay metrics so that they can be exposed.
67
pub struct AutoscalingMetricService {
78
memory_stat: MemoryStat,
9+
envelope_buffer: PartitionedEnvelopeBuffer,
810
up: u8,
911
}
1012

1113
impl AutoscalingMetricService {
12-
pub fn new(memory_stat: MemoryStat) -> Self {
13-
Self { memory_stat, up: 1 }
14+
pub fn new(memory_stat: MemoryStat, envelope_buffer: PartitionedEnvelopeBuffer) -> Self {
15+
Self {
16+
memory_stat,
17+
envelope_buffer,
18+
up: 1,
19+
}
1420
}
1521
}
1622

@@ -28,7 +34,12 @@ impl Service for AutoscalingMetricService {
2834
match message {
2935
AutoscalingMetrics::Check(sender) => {
3036
let memory_usage = self.memory_stat.memory();
31-
sender.send(AutoscalingData::new(memory_usage.used_percent(), self.up));
37+
sender.send(AutoscalingData {
38+
memory_usage: memory_usage.used_percent(),
39+
up: self.up,
40+
total_size: self.envelope_buffer.total_storage_size(),
41+
item_count: self.envelope_buffer.item_count()
42+
});
3243
}
3344
}
3445
}
@@ -65,10 +76,6 @@ impl FromMessage<AutoscalingMessageKind> for AutoscalingMetrics {
6576
pub struct AutoscalingData {
6677
pub memory_usage: f32,
6778
pub up: u8,
68-
}
69-
70-
impl AutoscalingData {
71-
pub fn new(memory_usage: f32, up: u8) -> Self {
72-
Self { memory_usage, up }
73-
}
79+
pub total_size: u64,
80+
pub item_count: u64,
7481
}

relay-server/src/services/buffer/envelope_buffer/mod.rs

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,6 @@ use std::collections::BTreeSet;
33
use std::convert::Infallible;
44
use std::error::Error;
55
use std::mem;
6-
use std::sync::atomic::AtomicI64;
7-
use std::sync::atomic::Ordering as AtomicOrdering;
8-
use std::sync::Arc;
96
use std::time::Duration;
107

118
use chrono::{DateTime, Utc};
@@ -165,6 +162,24 @@ impl PolymorphicEnvelopeBuffer {
165162
}
166163
}
167164

165+
/// Returns the number of envelopes tracked by the buffer since startup (pushes minus pops). It does
166+
/// not include the count that existed in a persistent spooler before.
167+
pub fn item_count(&self) -> u64 {
168+
match self {
169+
Self::Sqlite(buffer) => buffer.tracked_count,
170+
Self::InMemory(buffer) => buffer.tracked_count,
171+
}
172+
}
173+
174+
/// Returns the total number of bytes that the spooler storage uses or `None` if the number
175+
/// cannot be reliably determined.
176+
pub fn total_size(&self) -> Option<u64> {
177+
match self {
178+
Self::Sqlite(buffer) => buffer.stack_provider.total_size(),
179+
Self::InMemory(buffer) => buffer.stack_provider.total_size(),
180+
}
181+
}
182+
168183
/// Shuts down the [`PolymorphicEnvelopeBuffer`].
169184
pub async fn shutdown(&mut self) -> bool {
170185
// Currently, we want to flush the buffer only for disk, since the in memory implementation
@@ -228,7 +243,13 @@ struct EnvelopeBuffer<P: StackProvider> {
228243
/// count might not succeed if it takes more than a set timeout. For example, if we load the
229244
/// count of all envelopes from disk, and it takes more than the time we set, we will mark the
230245
/// initial count as 0 and just count incoming and outgoing envelopes from the buffer.
231-
total_count: Arc<AtomicI64>,
246+
total_count: i64,
247+
/// The total count of envelopes that the buffer is working with ignoring envelopes that
248+
/// were previously stored on disk.
249+
///
250+
/// On startup this will always be 0 and will only count incoming envelopes. If a reliable
251+
/// count of currently buffered envelopes is required, prefer this over `total_count`.
252+
tracked_count: u64,
232253
/// Whether the count initialization succeeded or not.
233254
///
234255
/// This boolean is just used for tagging the metric that tracks the total count of envelopes
@@ -245,7 +266,8 @@ impl EnvelopeBuffer<MemoryStackProvider> {
245266
stacks_by_project: Default::default(),
246267
priority_queue: Default::default(),
247268
stack_provider: MemoryStackProvider::new(memory_checker),
248-
total_count: Arc::new(AtomicI64::new(0)),
269+
total_count: 0,
270+
tracked_count: 0,
249271
total_count_initialized: false,
250272
partition_tag: partition_id.to_string(),
251273
}
@@ -260,7 +282,8 @@ impl EnvelopeBuffer<SqliteStackProvider> {
260282
stacks_by_project: Default::default(),
261283
priority_queue: Default::default(),
262284
stack_provider: SqliteStackProvider::new(partition_id, config).await?,
263-
total_count: Arc::new(AtomicI64::new(0)),
285+
total_count: 0,
286+
tracked_count: 0,
264287
total_count_initialized: false,
265288
partition_tag: partition_id.to_string(),
266289
})
@@ -318,7 +341,8 @@ where
318341
prio.received_at = received_at;
319342
});
320343

321-
self.total_count.fetch_add(1, AtomicOrdering::SeqCst);
344+
self.total_count += 1;
345+
self.tracked_count += 1;
322346
self.track_total_count();
323347

324348
Ok(())
@@ -385,7 +409,8 @@ where
385409
// We are fine with the count going negative, since it represents that more data was popped,
386410
// than it was initially counted, meaning that we had a wrong total count from
387411
// initialization.
388-
self.total_count.fetch_sub(1, AtomicOrdering::SeqCst);
412+
self.total_count -= 1;
413+
self.tracked_count = self.tracked_count.saturating_sub(1);
389414
self.track_total_count();
390415

391416
Ok(Some(envelope))
@@ -529,8 +554,7 @@ where
529554
.await;
530555
match total_count {
531556
Ok(total_count) => {
532-
self.total_count
533-
.store(total_count as i64, AtomicOrdering::SeqCst);
557+
self.total_count = total_count as i64;
534558
self.total_count_initialized = true;
535559
}
536560
Err(error) => {
@@ -546,7 +570,7 @@ where
546570

547571
/// Emits a metric to track the total count of envelopes that are in the envelope buffer.
548572
fn track_total_count(&self) {
549-
let total_count = self.total_count.load(AtomicOrdering::SeqCst) as f64;
573+
let total_count = self.total_count as f64;
550574
let initialized = match self.total_count_initialized {
551575
true => "true",
552576
false => "false",

0 commit comments

Comments
 (0)