Skip to content

Commit 393a672

Browse files
authored
Add nflog and silences metrics (#6659)
Signed-off-by: SungJin1212 <[email protected]>
1 parent be0fc7f commit 393a672

File tree

3 files changed

+104
-10
lines changed

3 files changed

+104
-10
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526
77
* [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integrations msteamsv2, jira, and rocketchat. #6590
88
* [FEATURE] Ingester: Support out-of-order native histogram ingestion. It is automatically enabled when `-ingester.out-of-order-time-window > 0` and `-blocks-storage.tsdb.enable-native-histograms=true`. #6626 #6663
9+
* [ENHANCEMENT] Alertmanager: Add nflog and silences maintenance metrics. #6659
910
* [ENHANCEMENT] Querier: limit label APIs to query only ingesters if the `start` param is not specified. #6618
1011
* [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605
1112
* [ENHANCEMENT] Update prometheus version to v3.1.0. #6583

pkg/alertmanager/alertmanager_metrics.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ type alertmanagerMetrics struct {
3030
nflogQueryErrorsTotal *prometheus.Desc
3131
nflogQueryDuration *prometheus.Desc
3232
nflogPropagatedMessagesTotal *prometheus.Desc
33+
nflogMaintenanceTotal *prometheus.Desc
34+
nflogMaintenanceErrorsTotal *prometheus.Desc
3335

3436
// exported metrics, gathered from Alertmanager Marker
3537
markerAlerts *prometheus.Desc
@@ -43,6 +45,8 @@ type alertmanagerMetrics struct {
4345
silencesQueryDuration *prometheus.Desc
4446
silences *prometheus.Desc
4547
silencesPropagatedMessagesTotal *prometheus.Desc
48+
silencesMaintenanceTotal *prometheus.Desc
49+
silencesMaintenanceErrorsTotal *prometheus.Desc
4650

4751
// The alertmanager config hash.
4852
configHashValue *prometheus.Desc
@@ -127,6 +131,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
127131
"cortex_alertmanager_nflog_gossip_messages_propagated_total",
128132
"Number of received gossip messages that have been further gossiped.",
129133
nil, nil),
134+
nflogMaintenanceTotal: prometheus.NewDesc(
135+
"cortex_alertmanager_nflog_maintenance_total",
136+
"How many maintenances were executed for the notification log.",
137+
nil, nil),
138+
nflogMaintenanceErrorsTotal: prometheus.NewDesc(
139+
"cortex_alertmanager_nflog_maintenance_errors_total",
140+
"How many maintenances were executed for the notification log that failed.",
141+
nil, nil),
130142
markerAlerts: prometheus.NewDesc(
131143
"cortex_alertmanager_alerts",
132144
"How many alerts by state.",
@@ -163,6 +175,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
163175
"cortex_alertmanager_silences",
164176
"How many silences by state.",
165177
[]string{"user", "state"}, nil),
178+
silencesMaintenanceTotal: prometheus.NewDesc(
179+
"cortex_alertmanager_silences_maintenance_total",
180+
"How many maintenances were executed for silences.",
181+
nil, nil),
182+
silencesMaintenanceErrorsTotal: prometheus.NewDesc(
183+
"cortex_alertmanager_silences_maintenance_errors_total",
184+
"How many maintenances were executed for silences that failed.",
185+
nil, nil),
166186
configHashValue: prometheus.NewDesc(
167187
"cortex_alertmanager_config_hash",
168188
"Hash of the currently loaded alertmanager configuration.",
@@ -268,6 +288,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
268288
out <- m.nflogQueryErrorsTotal
269289
out <- m.nflogQueryDuration
270290
out <- m.nflogPropagatedMessagesTotal
291+
out <- m.nflogMaintenanceTotal
292+
out <- m.nflogMaintenanceErrorsTotal
271293
out <- m.silencesGCDuration
272294
out <- m.silencesSnapshotDuration
273295
out <- m.silencesSnapshotSize
@@ -276,6 +298,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
276298
out <- m.silencesQueryDuration
277299
out <- m.silencesPropagatedMessagesTotal
278300
out <- m.silences
301+
out <- m.silencesMaintenanceTotal
302+
out <- m.silencesMaintenanceErrorsTotal
279303
out <- m.configHashValue
280304
out <- m.partialMerges
281305
out <- m.partialMergesFailed
@@ -317,6 +341,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
317341
data.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
318342
data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
319343
data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
344+
data.SendSumOfCounters(out, m.nflogMaintenanceTotal, "alertmanager_nflog_maintenance_total")
345+
data.SendSumOfCounters(out, m.nflogMaintenanceErrorsTotal, "alertmanager_nflog_maintenance_errors_total")
320346

321347
data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
322348
data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
@@ -326,6 +352,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
326352
data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
327353
data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
328354
data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state")
355+
data.SendSumOfCounters(out, m.silencesMaintenanceTotal, "alertmanager_silences_maintenance_total")
356+
data.SendSumOfCounters(out, m.silencesMaintenanceErrorsTotal, "alertmanager_silences_maintenance_errors_total")
329357

330358
data.SendMaxOfGaugesPerUser(out, m.configHashValue, "alertmanager_config_hash")
331359

pkg/alertmanager/alertmanager_metrics_test.go

Lines changed: 75 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,12 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
104104
# HELP cortex_alertmanager_nflog_snapshot_size_bytes Size of the last notification log snapshot in bytes.
105105
# TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge
106106
cortex_alertmanager_nflog_snapshot_size_bytes 111
107+
# HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
108+
# TYPE cortex_alertmanager_nflog_maintenance_total counter
109+
cortex_alertmanager_nflog_maintenance_total 111
110+
# HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
111+
# TYPE cortex_alertmanager_nflog_maintenance_errors_total counter
112+
cortex_alertmanager_nflog_maintenance_errors_total 111
107113
# HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds.
108114
# TYPE cortex_alertmanager_notification_latency_seconds histogram
109115
cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14
@@ -277,6 +283,12 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
277283
# HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes.
278284
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
279285
cortex_alertmanager_silences_snapshot_size_bytes 111
286+
# HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences.
287+
# TYPE cortex_alertmanager_silences_maintenance_total counter
288+
cortex_alertmanager_silences_maintenance_total 111
289+
# HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed.
290+
# TYPE cortex_alertmanager_silences_maintenance_errors_total counter
291+
cortex_alertmanager_silences_maintenance_errors_total 111
280292
# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
281293
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
282294
cortex_alertmanager_state_fetch_replica_state_failed_total 0
@@ -414,6 +426,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
414426
# TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge
415427
cortex_alertmanager_nflog_snapshot_size_bytes 111
416428
429+
# HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
430+
# TYPE cortex_alertmanager_nflog_maintenance_total counter
431+
cortex_alertmanager_nflog_maintenance_total 111
432+
# HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
433+
# TYPE cortex_alertmanager_nflog_maintenance_errors_total counter
434+
cortex_alertmanager_nflog_maintenance_errors_total 111
435+
417436
# HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds.
418437
# TYPE cortex_alertmanager_notification_latency_seconds histogram
419438
cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14
@@ -598,6 +617,14 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
598617
# HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes.
599618
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
600619
cortex_alertmanager_silences_snapshot_size_bytes 111
620+
621+
# HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences.
622+
# TYPE cortex_alertmanager_silences_maintenance_total counter
623+
cortex_alertmanager_silences_maintenance_total 111
624+
# HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed.
625+
# TYPE cortex_alertmanager_silences_maintenance_errors_total counter
626+
cortex_alertmanager_silences_maintenance_errors_total 111
627+
601628
# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
602629
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
603630
cortex_alertmanager_state_fetch_replica_state_failed_total 0
@@ -715,6 +742,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
715742
# TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge
716743
cortex_alertmanager_nflog_snapshot_size_bytes 11
717744
745+
# HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
746+
# TYPE cortex_alertmanager_nflog_maintenance_total counter
747+
cortex_alertmanager_nflog_maintenance_total 111
748+
# HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
749+
# TYPE cortex_alertmanager_nflog_maintenance_errors_total counter
750+
cortex_alertmanager_nflog_maintenance_errors_total 111
751+
718752
# HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds.
719753
# TYPE cortex_alertmanager_notification_latency_seconds histogram
720754
cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14
@@ -863,6 +897,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
863897
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
864898
cortex_alertmanager_silences_snapshot_size_bytes 11
865899
900+
# HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences.
901+
# TYPE cortex_alertmanager_silences_maintenance_total counter
902+
cortex_alertmanager_silences_maintenance_total 111
903+
# HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed.
904+
# TYPE cortex_alertmanager_silences_maintenance_errors_total counter
905+
cortex_alertmanager_silences_maintenance_errors_total 111
906+
866907
# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
867908
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
868909
cortex_alertmanager_state_fetch_replica_state_failed_total 0
@@ -913,6 +954,8 @@ func populateAlertmanager(base float64) *prometheus.Registry {
913954
s.silencesActive.Set(base)
914955
s.silencesExpired.Set(base * 2)
915956
s.silencesPending.Set(base * 3)
957+
s.silencesMaintenanceTotal.Add(base)
958+
s.silencesMaintenanceErrorsTotal.Add(base)
916959

917960
n := newNflogMetrics(reg)
918961
n.gcDuration.Observe(base)
@@ -922,6 +965,8 @@ func populateAlertmanager(base float64) *prometheus.Registry {
922965
n.queryErrorsTotal.Add(base)
923966
n.queryDuration.Observe(base)
924967
n.propagatedMessagesTotal.Add(base)
968+
n.maintenanceTotal.Add(base)
969+
n.maintenanceErrorsTotal.Add(base)
925970

926971
nm := newNotifyMetrics(reg)
927972
for i, integration := range integrations {
@@ -967,6 +1012,8 @@ type nflogMetrics struct {
9671012
queryErrorsTotal prometheus.Counter
9681013
queryDuration prometheus.Histogram
9691014
propagatedMessagesTotal prometheus.Counter
1015+
maintenanceTotal prometheus.Counter
1016+
maintenanceErrorsTotal prometheus.Counter
9701017
}
9711018

9721019
func newNflogMetrics(r prometheus.Registerer) *nflogMetrics {
@@ -1002,22 +1049,32 @@ func newNflogMetrics(r prometheus.Registerer) *nflogMetrics {
10021049
Name: "alertmanager_nflog_gossip_messages_propagated_total",
10031050
Help: "Number of received gossip messages that have been further gossiped.",
10041051
})
1052+
m.maintenanceTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
1053+
Name: "alertmanager_nflog_maintenance_total",
1054+
Help: "How many maintenances were executed for the notification log.",
1055+
})
1056+
m.maintenanceErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
1057+
Name: "alertmanager_nflog_maintenance_errors_total",
1058+
Help: "How many maintenances were executed for the notification log that failed.",
1059+
})
10051060

10061061
return m
10071062
}
10081063

10091064
// Copied from github.com/alertmanager/silence/silence.go
10101065
type silenceMetrics struct {
1011-
gcDuration prometheus.Summary
1012-
snapshotDuration prometheus.Summary
1013-
snapshotSize prometheus.Gauge
1014-
queriesTotal prometheus.Counter
1015-
queryErrorsTotal prometheus.Counter
1016-
queryDuration prometheus.Histogram
1017-
silencesActive prometheus.Gauge
1018-
silencesPending prometheus.Gauge
1019-
silencesExpired prometheus.Gauge
1020-
propagatedMessagesTotal prometheus.Counter
1066+
gcDuration prometheus.Summary
1067+
snapshotDuration prometheus.Summary
1068+
snapshotSize prometheus.Gauge
1069+
queriesTotal prometheus.Counter
1070+
queryErrorsTotal prometheus.Counter
1071+
queryDuration prometheus.Histogram
1072+
silencesActive prometheus.Gauge
1073+
silencesPending prometheus.Gauge
1074+
silencesExpired prometheus.Gauge
1075+
propagatedMessagesTotal prometheus.Counter
1076+
silencesMaintenanceTotal prometheus.Counter
1077+
silencesMaintenanceErrorsTotal prometheus.Counter
10211078
}
10221079

10231080
func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics {
@@ -1068,6 +1125,14 @@ func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics {
10681125
Help: "How many silences by state.",
10691126
ConstLabels: prometheus.Labels{"state": string(types.SilenceStateExpired)},
10701127
})
1128+
m.silencesMaintenanceTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
1129+
Name: "alertmanager_silences_maintenance_total",
1130+
Help: "How many maintenances were executed for silences.",
1131+
})
1132+
m.silencesMaintenanceErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
1133+
Name: "alertmanager_silences_maintenance_errors_total",
1134+
Help: "How many maintenances were executed for silences that failed.",
1135+
})
10711136

10721137
return m
10731138
}

0 commit comments

Comments
 (0)