Commit e01c5ce

notifier: fix increment of metric prometheus_notifications_errors_total
Previously, prometheus_notifications_errors_total was incremented by one whenever a batch of alerts was affected by an error during sending to a specific alertmanager. However, the corresponding metric prometheus_notifications_sent_total, counting all alerts that were sent (including those where the send ended in an error), is incremented by the batch size, i.e. the number of alerts. Therefore, the ratio used in the mixin for the PrometheusErrorSendingAlertsToSomeAlertmanagers alert is inconsistent.

This commit changes the increment of prometheus_notifications_errors_total to the number of alerts that were sent in the attempt that ended in an error. It also adjusts the metric's help string accordingly and makes the wording of the alert in the mixin more precise.

Signed-off-by: beorn7 <[email protected]>
Parent: a6fb16f
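
The mixin alert mentioned above compares the two counters as a ratio, so both must count the same unit, i.e. individual alerts. A rough sketch of that ratio in PromQL, with label selectors omitted (the exact expression is defined in the mixin source, not in this diff):

    # Percentage of sent alerts whose delivery attempt ended in an error, per Alertmanager.
    # Only consistent if both counters are incremented per alert, as of this commit.
      rate(prometheus_notifications_errors_total[5m])
    /
      rate(prometheus_notifications_sent_total[5m])
    * 100
    > 1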

3 files changed (+7 lines, -6 lines)


CHANGELOG.md (1 addition, 0 deletions)

@@ -2,6 +2,7 @@
 
 ## unreleased
 
+* [CHANGE] Notifier: Increment the prometheus_notifications_errors_total metric by the number of affected alerts rather than by one per batch of affected alerts. #15428
 * [ENHANCEMENT] OTLP receiver: Convert also metric metadata. #15416
 
 ## 3.0.0 / 2024-11-14

documentation/prometheus-mixin/alerts.libsonnet (2 additions, 2 deletions)

@@ -84,8 +84,8 @@
           severity: 'warning',
         },
         annotations: {
-          summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.',
-          description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
+          summary: 'More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors.',
+          description: '{{ printf "%%.1f" $value }}%% of alerts sent by Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}} were affected by errors.' % $._config,
         },
       },
       {

notifier/notifier.go (4 additions, 4 deletions)

@@ -160,7 +160,7 @@ func newAlertMetrics(r prometheus.Registerer, queueCap int, queueLen, alertmanag
             Namespace: namespace,
             Subsystem: subsystem,
             Name:      "errors_total",
-            Help:      "Total number of errors sending alert notifications.",
+            Help:      "Total number of sent alerts affected by errors.",
         },
         []string{alertmanagerLabel},
     ),
@@ -619,13 +619,13 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {
 
         go func(ctx context.Context, client *http.Client, url string, payload []byte, count int) {
             if err := n.sendOne(ctx, client, url, payload); err != nil {
-                n.logger.Error("Error sending alert", "alertmanager", url, "count", count, "err", err)
-                n.metrics.errors.WithLabelValues(url).Inc()
+                n.logger.Error("Error sending alerts", "alertmanager", url, "count", count, "err", err)
+                n.metrics.errors.WithLabelValues(url).Add(float64(count))
             } else {
                 numSuccess.Inc()
             }
             n.metrics.latency.WithLabelValues(url).Observe(time.Since(begin).Seconds())
-            n.metrics.sent.WithLabelValues(url).Add(float64(len(amAlerts)))
+            n.metrics.sent.WithLabelValues(url).Add(float64(count))
 
             wg.Done()
         }(ctx, ams.client, am.url().String(), payload, len(amAlerts))
