Skip to content

Commit cbe5c4b

Browse files
iwanbk authored and LeeSmet committed
add connection_status gauge
which explicitly shows the 0-db connection status (up or down)
1 parent 5d20f09 commit cbe5c4b

File tree

1 file changed

+43
-10
lines changed

1 file changed

+43
-10
lines changed

zstor/src/actors/metrics.rs

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,15 @@ pub struct MetricsActor {
3030
data_zdbs: HashMap<ZdbConnectionInfo, NsInfo>,
3131
meta_zdbs: HashMap<ZdbConnectionInfo, NsInfo>,
3232
removed_zdbs: Vec<(ZdbConnectionInfo, BackendType)>,
33+
dead_zdbs: HashMap<ZdbConnectionInfo, BackendType>,
3334
successful_zstor_commands: HashMap<ZstorCommandId, usize>,
3435
failed_zstor_commands: HashMap<ZstorCommandId, usize>,
3536
zdbfs_stats: stats_t,
3637
prom_metrics: PromMetrics,
3738
}
3839

3940
struct PromMetrics {
41+
connection_status_gauges: IntGaugeVec,
4042
entries_gauges: IntGaugeVec,
4143
data_size_bytes_gauges: IntGaugeVec,
4244
data_limit_bytes_gauges: IntGaugeVec,
@@ -77,6 +79,7 @@ impl MetricsActor {
7779
data_zdbs: HashMap::new(),
7880
meta_zdbs: HashMap::new(),
7981
removed_zdbs: Vec::new(),
82+
dead_zdbs: HashMap::new(),
8083
successful_zstor_commands: HashMap::new(),
8184
failed_zstor_commands: HashMap::new(),
8285
zdbfs_stats: stats_t::default(),
@@ -87,6 +90,12 @@ impl MetricsActor {
8790

8891
fn setup_prometheus() -> PromMetrics {
8992
PromMetrics {
93+
connection_status_gauges: register_int_gauge_vec!(
94+
"connection_status",
95+
"Status of the connection to the 0-db",
96+
&["address", "namespace", "backend_type"]
97+
)
98+
.unwrap(),
9099
entries_gauges: register_int_gauge_vec!(
91100
"entries",
92101
"entries in namespace",
@@ -307,14 +316,16 @@ impl Handler<SetDataBackendInfo> for MetricsActor {
307316

308317
fn handle(&mut self, msg: SetDataBackendInfo, _: &mut Self::Context) -> Self::Result {
309318
if let Some(info) = msg.info {
310-
self.data_zdbs.insert(msg.ci, info);
319+
self.data_zdbs.insert(msg.ci.clone(), info);
320+
self.dead_zdbs.remove(&msg.ci);
311321
} else {
312-
let v = self.data_zdbs.remove(&msg.ci);
322+
let nsinfo: Option<NsInfo> = self.data_zdbs.remove(&msg.ci);
313323
// when the zdb is down, backend actors always send a None info
314324
// in this case we should remove the zdb from the metrics *only* if it was present.
315-
// Otherwise we will do unnecessary work and the `removed_zdbs` list will exploded
316-
if v.is_some() {
317-
self.removed_zdbs.push((msg.ci, BackendType::Data));
325+
// Otherwise we will do unnecessary work and the `removed_zdbs` list might explode
326+
if nsinfo.is_some() {
327+
self.removed_zdbs.push((msg.ci.clone(), BackendType::Data));
328+
self.dead_zdbs.insert(msg.ci, BackendType::Data);
318329
}
319330
}
320331
}
@@ -325,14 +336,16 @@ impl Handler<SetMetaBackendInfo> for MetricsActor {
325336

326337
fn handle(&mut self, msg: SetMetaBackendInfo, _: &mut Self::Context) -> Self::Result {
327338
if let Some(info) = msg.info {
328-
self.meta_zdbs.insert(msg.ci, info);
339+
self.meta_zdbs.insert(msg.ci.clone(), info);
340+
self.dead_zdbs.remove(&msg.ci);
329341
} else {
330-
let v = self.meta_zdbs.remove(&msg.ci);
342+
let nsinfo = self.meta_zdbs.remove(&msg.ci);
331343
// when the zdb is down, backend actors always send a None info
332344
// in this case we should remove the zdb from the metrics *only* if it was present.
333-
// Otherwise we will do unnecessary work and the `removed_zdbs` list will exploded
334-
if v.is_some() {
335-
self.removed_zdbs.push((msg.ci, BackendType::Meta));
345+
// Otherwise we will do unnecessary work and the `removed_zdbs` list might explode
346+
if nsinfo.is_some() {
347+
self.removed_zdbs.push((msg.ci.clone(), BackendType::Meta));
348+
self.dead_zdbs.insert(msg.ci, BackendType::Meta);
336349
}
337350
}
338351
}
@@ -434,6 +447,10 @@ impl Handler<GetPrometheusMetrics> for MetricsActor {
434447
labels.insert("address", &address);
435448
labels.insert("backend_type", backend_type.as_str());
436449

450+
let connection_status_gauge = self
451+
.prom_metrics
452+
.connection_status_gauges
453+
.get_metric_with(&labels)?;
437454
let entries_gauge = self.prom_metrics.entries_gauges.get_metric_with(&labels)?;
438455
let data_size_bytes_gauge = self
439456
.prom_metrics
@@ -461,6 +478,7 @@ impl Handler<GetPrometheusMetrics> for MetricsActor {
461478
.data_disk_freespace_bytes_gauges
462479
.get_metric_with(&labels)?;
463480

481+
connection_status_gauge.set(1);
464482
entries_gauge.set(info.entries as i64);
465483
data_size_bytes_gauge.set(info.data_size_bytes as i64);
466484
data_limit_bytes_gauge.set(
@@ -476,6 +494,21 @@ impl Handler<GetPrometheusMetrics> for MetricsActor {
476494
data_disk_freespace_bytes_gauge.set(info.data_disk_freespace_bytes as i64);
477495
}
478496

497+
// dead zdbs
498+
for (ci, backend_type) in self.dead_zdbs.iter() {
499+
let mut labels = HashMap::new();
500+
labels.insert("namespace", ci.namespace().unwrap_or(""));
501+
let address = ci.address().to_string();
502+
labels.insert("address", &address);
503+
labels.insert("backend_type", backend_type.as_str());
504+
505+
let connection_status_gauge = self
506+
.prom_metrics
507+
.connection_status_gauges
508+
.get_metric_with(&labels)?;
509+
connection_status_gauge.set(0);
510+
}
511+
479512
// Update zstor stats
480513
//
481514
// Successful calls

0 commit comments

Comments
 (0)