Skip to content
This repository was archived by the owner on Apr 30, 2025. It is now read-only.

Commit caf8d6b

Browse files
b1tamara, maxmoehl, mike-jc
authored and committed
Report metrics via prometheus
co-authored-by: Maximilian Moehl <[email protected]> co-authored-by: Mykhailo Yeromko <[email protected]>
1 parent 47ead2d commit caf8d6b

14 files changed

+1085
-40
lines changed

config/config.go

+28-4
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,28 @@ var defaultStatusConfig = StatusConfig{
112112
}
113113

114114
type PrometheusConfig struct {
115-
Port uint16 `yaml:"port"`
116-
CertPath string `yaml:"cert_path"`
117-
KeyPath string `yaml:"key_path"`
118-
CAPath string `yaml:"ca_path"`
115+
Enabled bool `yaml:"enabled,omitempty"`
116+
Port uint16 `yaml:"port"`
117+
CertPath string `yaml:"cert_path"`
118+
KeyPath string `yaml:"key_path"`
119+
CAPath string `yaml:"ca_path"`
120+
Meters MetersConfig `yaml:"meters,omitempty"`
121+
}
122+
123+
var defaultPrometheusConfig = PrometheusConfig{
124+
Meters: defaultMetersConfig,
125+
}
126+
127+
type MetersConfig struct {
128+
RouteLookupTimeHistogramBuckets []float64 `yaml:"route_lookup_time_histogram_buckets,omitempty"`
129+
RouteRegistrationLatencyHistogramBuckets []float64 `yaml:"route_registration_latency_histogram_buckets,omitempty"`
130+
RoutingResponseLatencyHistogramBuckets []float64 `yaml:"routing_response_latency_histogram_buckets,omitempty"`
131+
}
132+
133+
var defaultMetersConfig = MetersConfig{
134+
RouteLookupTimeHistogramBuckets: []float64{10_000, 20_000, 30_000, 40_000, 50_000, 60_000, 70_000, 80_000, 90_000, 100_000},
135+
RouteRegistrationLatencyHistogramBuckets: []float64{0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4},
136+
RoutingResponseLatencyHistogramBuckets: []float64{1, 2, 4, 6, 8, 10, 20, 40, 50, 100, 500, 1000},
119137
}
120138

121139
type NatsConfig struct {
@@ -479,6 +497,9 @@ type Config struct {
479497
// reports latency under gorouter sourceid, and with and without component name
480498
PerRequestMetricsReporting bool `yaml:"per_request_metrics_reporting,omitempty"`
481499

500+
// Switch to enable or disable the old metrics reporting using Envelope v1 (enabled by default)
501+
EnableEnvelopeV1Metrics bool `yaml:"enable_envelope_v1_metrics"`
502+
482503
// Old metric, to eventually be replaced by prometheus reporting
483504
SendHttpStartStopServerEvent bool `yaml:"send_http_start_stop_server_event,omitempty"`
484505

@@ -496,6 +517,7 @@ var defaultConfig = Config{
496517
Nats: defaultNatsConfig,
497518
Logging: defaultLoggingConfig,
498519
Port: 8081,
520+
Prometheus: defaultPrometheusConfig,
499521
Index: 0,
500522
GoMaxProcs: -1,
501523
EnablePROXY: false,
@@ -550,6 +572,8 @@ var defaultConfig = Config{
550572

551573
PerRequestMetricsReporting: true,
552574

575+
EnableEnvelopeV1Metrics: true,
576+
553577
SendHttpStartStopServerEvent: true,
554578

555579
SendHttpStartStopClientEvent: true,

config/config_test.go

+35-1
Original file line numberDiff line numberDiff line change
@@ -223,21 +223,44 @@ max_request_header_bytes: 10
223223
})
224224

225225
It("sets prometheus endpoint config", func() {
226+
cfg, err := DefaultConfig()
227+
Expect(err).ToNot(HaveOccurred())
228+
226229
var b = []byte(`
227230
prometheus:
231+
enabled: true
228232
port: 1234
229233
cert_path: /some-cert-path
230234
key_path: /some-key-path
231235
ca_path: /some-ca-path
232236
`)
233237

234-
err := config.Initialize(b)
238+
err = config.Initialize(b)
235239
Expect(err).ToNot(HaveOccurred())
236240

241+
Expect(config.Prometheus.Enabled).To(BeTrue())
237242
Expect(config.Prometheus.Port).To(Equal(uint16(1234)))
238243
Expect(config.Prometheus.CertPath).To(Equal("/some-cert-path"))
239244
Expect(config.Prometheus.KeyPath).To(Equal("/some-key-path"))
240245
Expect(config.Prometheus.CAPath).To(Equal("/some-ca-path"))
246+
Expect(config.Prometheus.Meters).To(Equal(cfg.Prometheus.Meters))
247+
})
248+
249+
It("sets prometheus histogram buckets config", func() {
250+
var b = []byte(`
251+
prometheus:
252+
meters:
253+
route_lookup_time_histogram_buckets: [0, 100, 10000]
254+
route_registration_latency_histogram_buckets: [-10, 0, 10]
255+
routing_response_latency_histogram_buckets: [0.1, 0.5, 1]
256+
`)
257+
258+
err := config.Initialize(b)
259+
Expect(err).ToNot(HaveOccurred())
260+
261+
Expect(config.Prometheus.Meters.RouteLookupTimeHistogramBuckets).To(Equal([]float64{0, 100, 10000}))
262+
Expect(config.Prometheus.Meters.RouteRegistrationLatencyHistogramBuckets).To(Equal([]float64{-10, 0, 10}))
263+
Expect(config.Prometheus.Meters.RoutingResponseLatencyHistogramBuckets).To(Equal([]float64{0.1, 0.5, 1}))
241264
})
242265

243266
It("defaults frontend idle timeout to 900", func() {
@@ -921,6 +944,17 @@ backends:
921944
Expect(config.PerRequestMetricsReporting).To(BeFalse())
922945
})
923946

947+
It("defaults EnableEnvelopeV1Metrics to true", func() {
948+
Expect(config.EnableEnvelopeV1Metrics).To(Equal(true))
949+
})
950+
951+
It("sets EnableEnvelopeV1Metrics", func() {
952+
var b = []byte(`enable_envelope_v1_metrics: false`)
953+
err := config.Initialize(b)
954+
Expect(err).ToNot(HaveOccurred())
955+
Expect(config.EnableEnvelopeV1Metrics).To(BeFalse())
956+
})
957+
924958
It("defaults SendHttpStartStopServerEvent to true", func() {
925959
Expect(config.SendHttpStartStopServerEvent).To(Equal(true))
926960
})

docs/04-observability.md

+171-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ status:
8080
pass: some_password
8181
```
8282
83-
### Metrics
83+
### Metrics (Envelope v1)
8484
8585
The `/varz` endpoint provides status and metrics. This endpoint requires basic
8686
authentication.
@@ -238,6 +238,176 @@ $ curl "http://someuser:[email protected]:8080/varz"
238238

239239
</details>
240240

241+
### Metrics (Prometheus)
242+
243+
The `<HOST>:<PROMETHEUS_PORT>/metrics` endpoint provides Prometheus metrics.
244+
245+
The Prometheus port, the TLS certificate paths, and other parameters are set in the `prometheus` section of the Gorouter configuration.
246+
247+
<details>
248+
<summary>Metrics response (click to expand)</summary>
249+
250+
```bash
251+
$ curl localhost:8042/metrics
252+
# HELP backend_exhausted_conns number of errors related to backend connection limit reached
253+
# TYPE backend_exhausted_conns counter
254+
backend_exhausted_conns 0
255+
# HELP backend_invalid_id number of bad backend id errors received from backends
256+
# TYPE backend_invalid_id counter
257+
backend_invalid_id 0
258+
# HELP backend_invalid_tls_cert number of tls certificate errors received from backends
259+
# TYPE backend_invalid_tls_cert counter
260+
backend_invalid_tls_cert 0
261+
# HELP backend_tls_handshake_failed number of backend handshake errors
262+
# TYPE backend_tls_handshake_failed counter
263+
backend_tls_handshake_failed 0
264+
# HELP bad_gateways number of bad gateway errors received from backends
265+
# TYPE bad_gateways counter
266+
bad_gateways 0
267+
# HELP empty_content_length_header number of requests with the empty content length header
268+
# TYPE empty_content_length_header counter
269+
empty_content_length_header 0
270+
# HELP latency routing response latency in ms
271+
# TYPE latency histogram
272+
latency_bucket{component="",le="1"} 2
273+
latency_bucket{component="",le="2"} 8705
274+
latency_bucket{component="",le="4"} 55939
275+
latency_bucket{component="",le="6"} 73786
276+
latency_bucket{component="",le="8"} 79800
277+
latency_bucket{component="",le="10"} 81911
278+
latency_bucket{component="",le="20"} 86670
279+
latency_bucket{component="",le="40"} 95701
280+
latency_bucket{component="",le="50"} 100139
281+
latency_bucket{component="",le="100"} 102135
282+
latency_bucket{component="",le="500"} 104114
283+
latency_bucket{component="",le="1000"} 104518
284+
latency_bucket{component="",le="+Inf"} 104519
285+
latency_sum{component=""} 1.5821502875950185e+06
286+
latency_count{component=""} 104519
287+
latency_bucket{component="CloudController",le="1"} 0
288+
latency_bucket{component="CloudController",le="2"} 0
289+
latency_bucket{component="CloudController",le="4"} 7
290+
latency_bucket{component="CloudController",le="6"} 3928
291+
latency_bucket{component="CloudController",le="8"} 5142
292+
latency_bucket{component="CloudController",le="10"} 5606
293+
latency_bucket{component="CloudController",le="20"} 22662
294+
latency_bucket{component="CloudController",le="40"} 75564
295+
latency_bucket{component="CloudController",le="50"} 82497
296+
latency_bucket{component="CloudController",le="100"} 87240
297+
latency_bucket{component="CloudController",le="500"} 91080
298+
latency_bucket{component="CloudController",le="1000"} 91566
299+
latency_bucket{component="CloudController",le="+Inf"} 91579
300+
latency_sum{component="CloudController"} 3.744289433603952e+06
301+
latency_count{component="CloudController"} 91579
302+
latency_bucket{component="route-emitter",le="1"} 0
303+
latency_bucket{component="route-emitter",le="2"} 337
304+
latency_bucket{component="route-emitter",le="4"} 448
305+
latency_bucket{component="route-emitter",le="6"} 462
306+
latency_bucket{component="route-emitter",le="8"} 1241
307+
latency_bucket{component="route-emitter",le="10"} 31250
308+
latency_bucket{component="route-emitter",le="20"} 75412
309+
latency_bucket{component="route-emitter",le="40"} 75695
310+
latency_bucket{component="route-emitter",le="50"} 75711
311+
latency_bucket{component="route-emitter",le="100"} 75717
312+
latency_bucket{component="route-emitter",le="500"} 75799
313+
latency_bucket{component="route-emitter",le="1000"} 75800
314+
latency_bucket{component="route-emitter",le="+Inf"} 75802
315+
latency_sum{component="route-emitter"} 808316.2385179874
316+
latency_count{component="route-emitter"} 75802
317+
latency_bucket{component="uaa",le="1"} 0
318+
latency_bucket{component="uaa",le="2"} 1
319+
latency_bucket{component="uaa",le="4"} 5951
320+
latency_bucket{component="uaa",le="6"} 64736
321+
latency_bucket{component="uaa",le="8"} 83650
322+
latency_bucket{component="uaa",le="10"} 90136
323+
latency_bucket{component="uaa",le="20"} 101690
324+
latency_bucket{component="uaa",le="40"} 102816
325+
latency_bucket{component="uaa",le="50"} 102868
326+
latency_bucket{component="uaa",le="100"} 106615
327+
latency_bucket{component="uaa",le="500"} 119313
328+
latency_bucket{component="uaa",le="1000"} 119325
329+
latency_bucket{component="uaa",le="+Inf"} 119328
330+
latency_sum{component="uaa"} 2.5461252577970154e+06
331+
latency_count{component="uaa"} 119328
332+
# HELP ms_since_last_registry_update time since last registry update in ms
333+
# TYPE ms_since_last_registry_update gauge
334+
ms_since_last_registry_update 3942
335+
# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler.
336+
# TYPE promhttp_metric_handler_errors_total counter
337+
promhttp_metric_handler_errors_total{cause="encoding"} 0
338+
promhttp_metric_handler_errors_total{cause="gathering"} 0
339+
# HELP registry_message number of route registration messages
340+
# TYPE registry_message counter
341+
registry_message{action="added",component=""} 16
342+
registry_message{action="added",component="CloudController"} 1
343+
registry_message{action="added",component="route-emitter"} 2079
344+
registry_message{action="added",component="uaa"} 4
345+
registry_message{action="updated",component=""} 290636
346+
registry_message{action="updated",component="CloudController"} 36330
347+
registry_message{action="updated",component="route-emitter"} 1.218307e+06
348+
registry_message{action="updated",component="uaa"} 145320
349+
# HELP rejected_requests number of rejected requests
350+
# TYPE rejected_requests counter
351+
rejected_requests 1
352+
# HELP responses number of responses
353+
# TYPE responses counter
354+
responses{status_group="2xx"} 312778
355+
responses{status_group="3xx"} 29409
356+
responses{status_group="4xx"} 49041
357+
# HELP route_lookup_time route lookup time per request in ns
358+
# TYPE route_lookup_time histogram
359+
route_lookup_time_bucket{le="10000"} 377295
360+
route_lookup_time_bucket{le="20000"} 389278
361+
route_lookup_time_bucket{le="30000"} 390445
362+
route_lookup_time_bucket{le="40000"} 391008
363+
route_lookup_time_bucket{le="50000"} 391261
364+
route_lookup_time_bucket{le="60000"} 391337
365+
route_lookup_time_bucket{le="70000"} 391378
366+
route_lookup_time_bucket{le="80000"} 391397
367+
route_lookup_time_bucket{le="90000"} 391417
368+
route_lookup_time_bucket{le="100000"} 391432
369+
route_lookup_time_bucket{le="+Inf"} 391525
370+
route_lookup_time_sum 1.584830859e+09
371+
route_lookup_time_count 391525
372+
# HELP route_registration_latency route registration latency in ms
373+
# TYPE route_registration_latency histogram
374+
route_registration_latency_bucket{le="0.1"} 0
375+
route_registration_latency_bucket{le="0.5"} 0
376+
route_registration_latency_bucket{le="1"} 0
377+
route_registration_latency_bucket{le="1.5"} 0
378+
route_registration_latency_bucket{le="2"} 2
379+
route_registration_latency_bucket{le="2.5"} 15
380+
route_registration_latency_bucket{le="3"} 152
381+
route_registration_latency_bucket{le="3.5"} 625
382+
route_registration_latency_bucket{le="4"} 1372
383+
route_registration_latency_bucket{le="+Inf"} 2012
384+
route_registration_latency_sum 7695.366220000008
385+
route_registration_latency_count 2012
386+
# HELP routes_pruned number of pruned routes
387+
# TYPE routes_pruned counter
388+
routes_pruned 0
389+
# HELP total_requests number of routing requests
390+
# TYPE total_requests counter
391+
total_requests{component="",is_routed_app="no"} 104419
392+
total_requests{component="CloudController",is_routed_app="no"} 91579
393+
total_requests{component="route-emitter",is_routed_app="no"} 75802
394+
total_requests{component="uaa",is_routed_app="no"} 119328
395+
# HELP total_routes number of total routes
396+
# TYPE total_routes gauge
397+
total_routes 54
398+
# HELP unregistry_message number of unregister messages
399+
# TYPE unregistry_message counter
400+
unregistry_message{component="route-emitter"} 2012
401+
# HELP websocket_failures websocket failure
402+
# TYPE websocket_failures counter
403+
websocket_failures 0
404+
# HELP websocket_upgrades websocket upgrade to websocket
405+
# TYPE websocket_upgrades counter
406+
websocket_upgrades 0
407+
```
408+
409+
</details>
410+
241411
### Profiling the Server
242412

243413
The Gorouter runs the

integration/main_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,7 @@ var _ = Describe("Router Integration", func() {
703703
metricsPort := test_util.NextAvailPort()
704704
serverCAPath, serverCertPath, serverKeyPath, clientCert := tls_helpers.GenerateCaAndMutualTlsCerts()
705705

706+
c.Prometheus.Enabled = true
706707
c.Prometheus.Port = metricsPort
707708
c.Prometheus.CertPath = serverCertPath
708709
c.Prometheus.KeyPath = serverKeyPath

0 commit comments

Comments
 (0)