Add option to configure promhttp error handling

Nikita Popov · Nikita Popov · commit b5a9fc60dbad · 2025-02-10T14:33:51.000+02:00
diff --git a/README.md b/README.md
@@ -78,22 +78,23 @@ If you are still using the legacy [Access scopes][access-scopes], the `https://w
 
 | Flag                                | Required | Default                   | Description                                                                                                                                                                                       |
 | ----------------------------------- | -------- |---------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `google.project-ids`                 | No       | GCloud SDK auto-discovery | Repeatable flag of Google Project IDs                                                                                                                                                        |
-| `google.projects.filter`            | No       |                           | GCloud projects filter expression. See more [here](https://cloud.google.com/sdk/gcloud/reference/projects/list).                                                                                                                                                        |
+| `google.project-ids`                 | No       | GCloud SDK auto-discovery | Repeatable flag of Google Project IDs                                                                                                                                                             |
+| `google.projects.filter`            | No       |                           | GCloud projects filter expression. See more [here](https://cloud.google.com/sdk/gcloud/reference/projects/list).                                                                                  |
 | `monitoring.metrics-ingest-delay`   | No       |                           | Offsets metric collection by a delay appropriate for each metric type, e.g. because bigquery metrics are slow to appear                                                                           |
 | `monitoring.drop-delegated-projects` | No       | No                        | Drop metrics from attached projects and fetch `project_id` only.                                                                                                                                  |
-| `monitoring.metrics-prefixes`  | Yes      |                           | Repeatable flag of Google Stackdriver Monitoring Metric Type prefixes (see [example][metrics-prefix-example] and [available metrics][metrics-list])                                                  |
+| `monitoring.metrics-prefixes`  | Yes      |                           | Repeatable flag of Google Stackdriver Monitoring Metric Type prefixes (see [example][metrics-prefix-example] and [available metrics][metrics-list])                                               |
 | `monitoring.metrics-interval`       | No       | `5m`                      | Metric's timestamp interval to request from the Google Stackdriver Monitoring Metrics API. Only the most recent data point is used                                                                |
 | `monitoring.metrics-offset`         | No       | `0s`                      | Offset (into the past) for the metric's timestamp interval to request from the Google Stackdriver Monitoring Metrics API, to handle latency in published metrics                                  |
-| `monitoring.filters`                | No       |                           | Additonal filters to be sent on the Monitoring API call. Add multiple filters by providing this parameter multiple times. See [monitoring.filters](#using-filters) for more info. |
+| `monitoring.filters`                | No       |                           | Additional filters to be sent on the Monitoring API call. Add multiple filters by providing this parameter multiple times. See [monitoring.filters](#using-filters) for more info.                |
 | `monitoring.aggregate-deltas`       | No       |                           | If enabled will treat all DELTA metrics as an in-memory counter instead of a gauge. Be sure to read [what to know about aggregating DELTA metrics](#what-to-know-about-aggregating-delta-metrics) |
 | `monitoring.aggregate-deltas-ttl`   | No       | `30m`                     | How long should a delta metric continue to be exported and stored after GCP stops producing it. Read [slow moving metrics](#slow-moving-metrics) to understand the problem this attempts to solve |
 | `monitoring.descriptor-cache-ttl`   | No       | `0s`                      | How long should the metric descriptors for a prefixed be cached for                                                                                                                               |
+| `promhttp.error-handling`           | No       | `httpErrorOnError`        | Defines how errors are handled by promhttp.Handler while serving metrics. Possible values are `httpErrorOnError`, `continueOnError`, and `panicOnError`, which map to the corresponding [promhttp options][promhttp-error-handling-opts]. |
 | `stackdriver.max-retries`           | No       | `0`                       | Max number of retries that should be attempted on 503 errors from stackdriver.                                                                                                                    |
-| `stackdriver.http-timeout`          | No       | `10s`                     |  How long should stackdriver_exporter wait for a result from the Stackdriver API.                                                                                                                 |
+| `stackdriver.http-timeout`          | No       | `10s`                     | How long should stackdriver_exporter wait for a result from the Stackdriver API.                                                                                                                  |
 | `stackdriver.max-backoff=`          | No       |                           | Max time between each request in an exp backoff scenario.                                                                                                                                         |
-| `stackdriver.backoff-jitter`        | No       | `1s`                       | The amount of jitter to introduce in a exp backoff scenario.                                                                                                                                      |
-| `stackdriver.retry-statuses`        | No       | `503`                     |  The HTTP statuses that should trigger a retry.                                                                                                                                                   |
+| `stackdriver.backoff-jitter`        | No       | `1s`                      | The amount of jitter to introduce in an exp backoff scenario.                                                                                                                                     |
+| `stackdriver.retry-statuses`        | No       | `503`                     | The HTTP statuses that should trigger a retry.                                                                                                                                                    |
 | `web.config.file`                   | No       |                           | [EXPERIMENTAL] Path to configuration file that can enable TLS or authentication.                                                                                                                  |
 | `web.listen-address`                | No       | `:9255`                   | Address to listen on for web interface and telemetry Repeatable for multiple addresses.                                                                                                           |
 | `web.systemd-socket`                | No       |                           | Use systemd socket activation listeners instead of port listeners (Linux only).                                                                                                                   |
@@ -247,4 +248,5 @@ Apache License 2.0, see [LICENSE][license].
 [monitored-resources]: https://cloud.google.com/monitoring/api/resources
 [prometheus]: https://prometheus.io/
 [prometheus-boshrelease]: https://github.com/cloudfoundry-community/prometheus-boshrelease
+[promhttp-error-handling-opts]: https://pkg.go.dev/github.com/prometheus/client_golang/prometheus/promhttp#HandlerErrorHandling
 [stackdriver]: https://cloud.google.com/monitoring/
diff --git a/stackdriver_exporter.go b/stackdriver_exporter.go
@@ -137,6 +137,10 @@ var (
 	monitoringDescriptorCacheOnlyGoogle = kingpin.Flag(
 		"monitoring.descriptor-cache-only-google", "Only cache descriptors for *.googleapis.com metrics",
 	).Default("true").Bool()
+
+	promHttpErrorHandling = kingpin.Flag(
+		"promhttp.error-handling", "Defines how errors are handled by promhttp.Handler while serving metrics",
+	).Default("httpErrorOnError").Enum("httpErrorOnError", "continueOnError", "panicOnError")
 )
 
 func init() {
@@ -277,7 +281,10 @@ func (h *handler) innerHandler(filters map[string]bool) http.Handler {
 			registry,
 		}
 	}
-	opts := promhttp.HandlerOpts{ErrorLog: slog.NewLogLogger(h.logger.Handler(), slog.LevelError)}
+	opts := promhttp.HandlerOpts{
+		ErrorLog:      slog.NewLogLogger(h.logger.Handler(), slog.LevelError),
+		ErrorHandling: getPromHttpErrorHandlingOpt(*promHttpErrorHandling),
+	}
 	// Delegate http serving to Prometheus client library, which will call collector.Collect.
 	return promhttp.HandlerFor(gatherers, opts)
 }
@@ -464,3 +471,13 @@ func parseMetricExtraFilters() []collectors.MetricFilter {
 	}
 	return extraFilters
 }
+
+func getPromHttpErrorHandlingOpt(flagOpt string) promhttp.HandlerErrorHandling {
+	switch flagOpt { // value of the promhttp.error-handling flag
+	case "continueOnError":
+		return promhttp.ContinueOnError
+	case "panicOnError":
+		return promhttp.PanicOnError
+	}
+	return promhttp.HTTPErrorOnError // "httpErrorOnError" and any other value
+}

Original file line number	Diff line number	Diff line change
`@@ -137,6 +137,10 @@ var (`
`137`	`137`	`monitoringDescriptorCacheOnlyGoogle = kingpin.Flag(`
`138`	`138`	`"monitoring.descriptor-cache-only-google", "Only cache descriptors for *.googleapis.com metrics",`
`139`	`139`	`).Default("true").Bool()`
	`140`	`+`
	`141`	`+ promHttpErrorHandling = kingpin.Flag(`
	`142`	`+ "promhttp.error-handling", "Defines how errors are handled by promhttp.Handler while serving metrics",`
	`143`	`+ ).Default("httpErrorOnError").Enum("httpErrorOnError", "continueOnError", "panicOnError")`
`140`	`144`	`)`
`141`	`145`
`142`	`146`	`func init() {`
`@@ -277,7 +281,10 @@ func (h *handler) innerHandler(filters map[string]bool) http.Handler {`
`277`	`281`	`registry,`
`278`	`282`	`}`
`279`	`283`	`}`
`280`		`- opts := promhttp.HandlerOpts{ErrorLog: slog.NewLogLogger(h.logger.Handler(), slog.LevelError)}`
	`284`	`+ opts := promhttp.HandlerOpts{`
	`285`	`+ ErrorLog: slog.NewLogLogger(h.logger.Handler(), slog.LevelError),`
	`286`	`+ ErrorHandling: getPromHttpErrorHandlingOpt(*promHttpErrorHandling),`
	`287`	`+ }`
`281`	`288`	`// Delegate http serving to Prometheus client library, which will call collector.Collect.`
`282`	`289`	`return promhttp.HandlerFor(gatherers, opts)`
`283`	`290`	`}`
`@@ -464,3 +471,13 @@ func parseMetricExtraFilters() []collectors.MetricFilter {`
`464`	`471`	`}`
`465`	`472`	`return extraFilters`
`466`	`473`	`}`
	`474`	`+`
	`475`	`+func getPromHttpErrorHandlingOpt(flagOpt string) promhttp.HandlerErrorHandling {`
	`476`	`+ switch flagOpt { // value of the promhttp.error-handling flag`
	`477`	`+ case "continueOnError":`
	`478`	`+ return promhttp.ContinueOnError`
	`479`	`+ case "panicOnError":`
	`480`	`+ return promhttp.PanicOnError`
	`481`	`+ }`
	`482`	`+ return promhttp.HTTPErrorOnError // "httpErrorOnError" and any other value`
	`483`	`+}`