@@ -30,13 +30,15 @@ pub struct MetricsActor {
3030 data_zdbs : HashMap < ZdbConnectionInfo , NsInfo > ,
3131 meta_zdbs : HashMap < ZdbConnectionInfo , NsInfo > ,
3232 removed_zdbs : Vec < ( ZdbConnectionInfo , BackendType ) > ,
33+ dead_zdbs : HashMap < ZdbConnectionInfo , BackendType > ,
3334 successful_zstor_commands : HashMap < ZstorCommandId , usize > ,
3435 failed_zstor_commands : HashMap < ZstorCommandId , usize > ,
3536 zdbfs_stats : stats_t ,
3637 prom_metrics : PromMetrics ,
3738}
3839
3940struct PromMetrics {
41+ connection_status_gauges : IntGaugeVec ,
4042 entries_gauges : IntGaugeVec ,
4143 data_size_bytes_gauges : IntGaugeVec ,
4244 data_limit_bytes_gauges : IntGaugeVec ,
@@ -77,6 +79,7 @@ impl MetricsActor {
7779 data_zdbs : HashMap :: new ( ) ,
7880 meta_zdbs : HashMap :: new ( ) ,
7981 removed_zdbs : Vec :: new ( ) ,
82+ dead_zdbs : HashMap :: new ( ) ,
8083 successful_zstor_commands : HashMap :: new ( ) ,
8184 failed_zstor_commands : HashMap :: new ( ) ,
8285 zdbfs_stats : stats_t:: default ( ) ,
@@ -87,6 +90,12 @@ impl MetricsActor {
8790
8891 fn setup_prometheus ( ) -> PromMetrics {
8992 PromMetrics {
93+ connection_status_gauges : register_int_gauge_vec ! (
94+ "connection_status" ,
95+ "Status of the connection to the 0-db" ,
96+ & [ "address" , "namespace" , "backend_type" ]
97+ )
98+ . unwrap ( ) ,
9099 entries_gauges : register_int_gauge_vec ! (
91100 "entries" ,
92101 "entries in namespace" ,
@@ -307,14 +316,16 @@ impl Handler<SetDataBackendInfo> for MetricsActor {
307316
308317 fn handle ( & mut self , msg : SetDataBackendInfo , _: & mut Self :: Context ) -> Self :: Result {
309318 if let Some ( info) = msg. info {
310- self . data_zdbs . insert ( msg. ci , info) ;
319+ self . data_zdbs . insert ( msg. ci . clone ( ) , info) ;
320+ self . dead_zdbs . remove ( & msg. ci ) ;
311321 } else {
312- let v = self . data_zdbs . remove ( & msg. ci ) ;
322+ let nsinfo : Option < NsInfo > = self . data_zdbs . remove ( & msg. ci ) ;
313323 // when the zdb is down, backend actors always send a None info
314324 // in this case we should remove the zdb from the metrics *only* if it was present.
315- // Otherwise we will do unnecessary work and the `removed_zdbs` list will exploded
316- if v. is_some ( ) {
317- self . removed_zdbs . push ( ( msg. ci , BackendType :: Data ) ) ;
325+ // Otherwise we will do unnecessary work and the `removed_zdbs` list might explode
326+ if nsinfo. is_some ( ) {
327+ self . removed_zdbs . push ( ( msg. ci . clone ( ) , BackendType :: Data ) ) ;
328+ self . dead_zdbs . insert ( msg. ci , BackendType :: Data ) ;
318329 }
319330 }
320331 }
@@ -325,14 +336,16 @@ impl Handler<SetMetaBackendInfo> for MetricsActor {
325336
326337 fn handle ( & mut self , msg : SetMetaBackendInfo , _: & mut Self :: Context ) -> Self :: Result {
327338 if let Some ( info) = msg. info {
328- self . meta_zdbs . insert ( msg. ci , info) ;
339+ self . meta_zdbs . insert ( msg. ci . clone ( ) , info) ;
340+ self . dead_zdbs . remove ( & msg. ci ) ;
329341 } else {
330- let v = self . meta_zdbs . remove ( & msg. ci ) ;
342+ let nsinfo = self . meta_zdbs . remove ( & msg. ci ) ;
331343 // when the zdb is down, backend actors always send a None info
332344 // in this case we should remove the zdb from the metrics *only* if it was present.
333- // Otherwise we will do unnecessary work and the `removed_zdbs` list will exploded
334- if v. is_some ( ) {
335- self . removed_zdbs . push ( ( msg. ci , BackendType :: Meta ) ) ;
345+ // Otherwise we will do unnecessary work and the `removed_zdbs` list might explode
346+ if nsinfo. is_some ( ) {
347+ self . removed_zdbs . push ( ( msg. ci . clone ( ) , BackendType :: Meta ) ) ;
348+ self . dead_zdbs . insert ( msg. ci , BackendType :: Meta ) ;
336349 }
337350 }
338351 }
@@ -434,6 +447,10 @@ impl Handler<GetPrometheusMetrics> for MetricsActor {
434447 labels. insert ( "address" , & address) ;
435448 labels. insert ( "backend_type" , backend_type. as_str ( ) ) ;
436449
450+ let connection_status_gauge = self
451+ . prom_metrics
452+ . connection_status_gauges
453+ . get_metric_with ( & labels) ?;
437454 let entries_gauge = self . prom_metrics . entries_gauges . get_metric_with ( & labels) ?;
438455 let data_size_bytes_gauge = self
439456 . prom_metrics
@@ -461,6 +478,7 @@ impl Handler<GetPrometheusMetrics> for MetricsActor {
461478 . data_disk_freespace_bytes_gauges
462479 . get_metric_with ( & labels) ?;
463480
481+ connection_status_gauge. set ( 1 ) ;
464482 entries_gauge. set ( info. entries as i64 ) ;
465483 data_size_bytes_gauge. set ( info. data_size_bytes as i64 ) ;
466484 data_limit_bytes_gauge. set (
@@ -476,6 +494,21 @@ impl Handler<GetPrometheusMetrics> for MetricsActor {
476494 data_disk_freespace_bytes_gauge. set ( info. data_disk_freespace_bytes as i64 ) ;
477495 }
478496
497+ // dead zdbs
498+ for ( ci, backend_type) in self . dead_zdbs . iter ( ) {
499+ let mut labels = HashMap :: new ( ) ;
500+ labels. insert ( "namespace" , ci. namespace ( ) . unwrap_or ( "" ) ) ;
501+ let address = ci. address ( ) . to_string ( ) ;
502+ labels. insert ( "address" , & address) ;
503+ labels. insert ( "backend_type" , backend_type. as_str ( ) ) ;
504+
505+ let connection_status_gauge = self
506+ . prom_metrics
507+ . connection_status_gauges
508+ . get_metric_with ( & labels) ?;
509+ connection_status_gauge. set ( 0 ) ;
510+ }
511+
479512 // Update zstor stats
480513 //
481514 // Successful calls
0 commit comments