Skip to content

Commit

Permalink
Remove status metrics for deleted resources (#10597)
Browse files Browse the repository at this point in the history
Co-authored-by: Nathan Fudenberg <[email protected]>
Co-authored-by: changelog-bot <changelog-bot>
  • Loading branch information
ryanrolds and nfuden authored Feb 13, 2025
1 parent e1a5574 commit 0719073
Show file tree
Hide file tree
Showing 25 changed files with 520 additions and 27 deletions.
23 changes: 15 additions & 8 deletions .github/workflows/pr-kubernetes-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,40 +55,47 @@ jobs:
# NOTE: We use the GitHub action step time (as opposed to the `go test` time), because it is easier to capture

test:
# Dec 4, 2024: 22 minutes
# 2024-12-04: 22m
# 2025-02-13: 29m3s
- cluster-name: 'cluster-one'
go-test-args: '-v -timeout=25m'
go-test-run-regex: '^TestK8sGateway$$/^RouteDelegation$$|^TestGlooctlGlooGatewayEdgeGateway$$|^TestGlooctlK8sGateway$$'

# Dec 4, 2024: 23 minutes
# 2024-12-04: 23m
# 2025-02-13: 30m30s
- cluster-name: 'cluster-two'
go-test-args: '-v -timeout=25m'
go-test-run-regex: '^TestK8sGatewayIstioRevision$$|^TestRevisionIstioRegression$$|^TestK8sGateway$$/^Deployer$$|^TestK8sGateway$$/^RouteOptions$$|^TestK8sGateway$$/^VirtualHostOptions$$|^TestK8sGateway$$/^Upstreams$$|^TestK8sGateway$$/^HeadlessSvc$$|^TestK8sGateway$$/^PortRouting$$|^TestK8sGatewayMinimalDefaultGatewayParameters$$|^TestK8sGateway$$/^DirectResponse$$|^TestK8sGateway$$/^HttpListenerOptions$$|^TestK8sGateway$$/^ListenerOptions$$|^TestK8sGateway$$/^GlooAdminServer$$'

# Dec 4, 2024: 24 minutes
# 2024-12-04: 24m
# 2025-02-13: 31m49s
- cluster-name: 'cluster-three'
go-test-args: '-v -timeout=30m'
go-test-run-regex: '(^TestK8sGatewayIstioAutoMtls$$|^TestAutomtlsIstioEdgeApisGateway$$|^TestIstioEdgeApiGateway$$|^TestIstioRegression$$)'

# Dec 4, 2024: 21 minutes
# 2024-12-04: 21m
# 2025-02-13: 28m3s
- cluster-name: 'cluster-four'
go-test-args: '-v -timeout=30m'
go-test-run-regex: '(^TestK8sGatewayIstio$$|^TestGlooGatewayEdgeGateway$$|^TestGlooctlIstioInjectEdgeApiGateway$$)'

# Dec 4, 2024: 24 minutes
# 2024-12-04: 24m
# 2025-02-13: 35m21s
- cluster-name: 'cluster-five'
go-test-args: '-v -timeout=30m'
go-test-run-regex: '^TestFullEnvoyValidation$$|^TestValidationStrict$$|^TestValidationAlwaysAccept$$|^TestTransformationValidationDisabled$$'

# Dec 4, 2024: 26 minutes
# 2024-12-04: 26m
# 2025-02-13: 33m19s
- cluster-name: 'cluster-six'
go-test-args: '-v -timeout=30m'
go-test-run-regex: '^TestDiscoveryWatchlabels$$|^TestK8sGatewayNoValidation$$|^TestHelm$$|^TestHelmSettings$$|^TestK8sGatewayAws$$|^TestK8sGateway$$/^HTTPRouteServices$$|^TestK8sGateway$$/^TCPRouteServices$$'

# Dec 4, 2024: 16 minutes
# 2024-12-04: 16m
# 2025-02-13: 26m29s
- cluster-name: 'cluster-seven'
go-test-args: '-v -timeout=25m'
go-test-run-regex: '^TestK8sGateway$$/^CRDCategories$$|^TestK8sGateway$$/^Metrics$$|^TestGloomtlsGatewayEdgeGateway$$|^TestGloomtlsGatewayK8sGateway$$|^TestWatchNamespaceSelector$$'
go-test-run-regex: '^TestK8sGateway$$/^CRDCategories$$|^TestK8sGateway$$/^Metrics$$|^TestGloomtlsGatewayEdgeGateway$$|^TestGloomtlsGatewayK8sGateway$$|^TestGlooGatewayEdgeGatewayClearMetrics$$|^TestWatchNamespaceSelector$$'

# In our PR tests, we run the suite of tests using the upper ends of versions that we claim to support
# The versions should mirror: https://docs.solo.io/gloo-edge/latest/reference/support/
Expand Down
12 changes: 12 additions & 0 deletions changelog/v1.19.0-beta8/remove-metrics-for-deleted-resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
changelog:
- type: FIX
description: |
Added the ability to control if resource status metrics are no longer reported after deletion.
If a resource was invalid and deleted, a status metric indicating a problem was left behind.
This could lead to confusion and false alarms.
Setting `.Values.gloo.clearStatusMetrics` to `true` will result in metrics
for deleted resources no longer being reported.
This may cause metric scraping to infrequently not see status metrics.
issueLink: https://github.com/kgateway-dev/kgateway/issues/6938
resolvesIssue: false
1 change: 1 addition & 0 deletions docs/content/reference/values.txt
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,7 @@
|gloo.headerSecretRefNsMatchesUs|bool||Set to true to require that secrets sent in headers via headerSecretRefs come from the same namespace as the destination upstream. Default: false|
|gloo.podDisruptionBudget.minAvailable|string||Corresponds directly with the _minAvailable_ field in the [PodDisruptionBudgetSpec](https://kubernetes.io/docs/reference/kubernetes-api/policy-resources/pod-disruption-budget-v1/#PodDisruptionBudgetSpec). This value is mutually exclusive with _maxUnavailable_.|
|gloo.podDisruptionBudget.maxUnavailable|string||Corresponds directly with the _maxUnavailable_ field in the [PodDisruptionBudgetSpec](https://kubernetes.io/docs/reference/kubernetes-api/policy-resources/pod-disruption-budget-v1/#PodDisruptionBudgetSpec). This value is mutually exclusive with _minAvailable_.|
|gloo.clearStatusMetrics|bool||Set to true to clear status metrics for deleted resources. This may cause metric scraping to infrequently not see status metrics. Default is false.|
|discovery.deployment.image.tag|string|<release_version, ex: 1.2.3>|The image tag for the container.|
|discovery.deployment.image.repository|string|discovery|The image repository (name) for the container.|
|discovery.deployment.image.digest|string||The container image's hash digest (e.g. 'sha256:12345...'), consumed when variant=standard.|
Expand Down
2 changes: 2 additions & 0 deletions docs/content/static/content/osa_provided.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ Name|Version|License
[onsi/gomega](https://github.com/onsi/gomega)|v1.35.0|MIT License
[pkg/browser](https://github.com/pkg/browser)|v0.0.0-20180916011732-0a3d74bf9ce4|BSD 2-clause "Simplified" License
[pkg/errors](https://github.com/pkg/errors)|v0.9.1|BSD 2-clause "Simplified" License
[prometheus/client_model](https://github.com/prometheus/client_model)|v0.6.1|Apache License 2.0
[prometheus/common](https://github.com/prometheus/common)|v0.60.1|Apache License 2.0
[go-ruleguard/dsl](https://github.com/quasilyte/go-ruleguard)|v0.3.22|BSD 3-clause "New" or "Revised" License
[rotisserie/eris](https://github.com/rotisserie/eris)|v0.5.4|MIT License
[saiskee/gettercheck](https://github.com/saiskee/gettercheck)|v0.0.0-20210820204958-38443d06ebe0|MIT License
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ require (
github.com/google/go-cmp v0.6.0
github.com/google/uuid v1.6.0
github.com/mccutchen/go-httpbin/v2 v2.15.0
github.com/prometheus/client_model v0.6.1
github.com/prometheus/common v0.60.1
github.com/quasilyte/go-ruleguard/dsl v0.3.22
github.com/stoewer/go-strcase v1.3.0
github.com/stretchr/testify v1.9.0
Expand Down Expand Up @@ -283,8 +285,6 @@ require (
github.com/planetscale/vtprotobuf v0.6.1-0.20240409071808-615f978279ca // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.20.5 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.60.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/prometheus/statsd_exporter v0.21.0 // indirect
github.com/pseudomuto/protoc-gen-doc v1.5.1 // indirect
Expand Down
1 change: 1 addition & 0 deletions install/helm/gloo/generate/values.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ type Gloo struct {
DisableLeaderElection *bool `json:"disableLeaderElection,omitempty" desc:"Set to true to disable leader election, and ensure all running replicas are considered the leader. Do not enable this with multiple replicas of Gloo"`
HeaderSecretRefNsMatchesUs *bool `json:"headerSecretRefNsMatchesUs,omitempty" desc:"Set to true to require that secrets sent in headers via headerSecretRefs come from the same namespace as the destination upstream. Default: false"`
PodDisruptionBudget *PodDisruptionBudget `json:"podDisruptionBudget,omitempty"`
ClearStatusMetrics *bool `json:"clearStatusMetrics,omitempty" desc:"Set to true to clear status metrics for deleted resources. This may cause metric scraping to infrequently not see status metrics. Default is false."`
}

type KubeGateway struct {
Expand Down
4 changes: 4 additions & 0 deletions install/helm/gloo/templates/1-gloo-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ spec:
- name: GLOO_MTLS_SDS_ENABLED
value: "true"
{{- end }}
{{- if .Values.gloo.clearStatusMetrics }}
- name: GLOO_CLEAR_STATUS_METRICS
value: "true"
{{- end }}
{{- if not .Values.global.glooMtls.enabled }}
readinessProbe:
tcpSocket:
Expand Down
5 changes: 5 additions & 0 deletions pkg/utils/kubeutils/portforward/api_forwarder.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,11 @@ func (f *apiPortForwarder) Address() string {
return net.JoinHostPort(f.properties.localAddress, strconv.Itoa(f.properties.localPort))
}

// LocalPort returns the local port that is being forwarded to the remote port
func (f *apiPortForwarder) LocalPort() int {
return f.properties.localPort
}

func (f *apiPortForwarder) Close() {
close(f.stopCh)
// Closing the stop channel should close anything
Expand Down
5 changes: 5 additions & 0 deletions pkg/utils/kubeutils/portforward/cli_portforwarder.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,11 @@ func (c *cliPortForwarder) Address() string {
return net.JoinHostPort(c.properties.localAddress, strconv.Itoa(c.properties.localPort))
}

// LocalPort returns the local port that is being forwarded to the remote port
func (c *cliPortForwarder) LocalPort() int {
return c.properties.localPort
}

func (c *cliPortForwarder) Close() {
if c.cmdCancel != nil {
c.cmdCancel()
Expand Down
3 changes: 3 additions & 0 deletions pkg/utils/kubeutils/portforward/portforwarder.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ type PortForwarder interface {
// Address returns the local forwarded address. Only valid while the apiPortForwarder is running.
Address() string

// Local port that is being forwarded to the remote port
LocalPort() int

// Close this apiPortForwarder and release any resources.
Close()

Expand Down
68 changes: 61 additions & 7 deletions pkg/utils/statsutils/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,27 @@ import (
"context"
"fmt"
"strings"
"time"

errors "github.com/rotisserie/eris"
"github.com/solo-io/gloo/pkg/utils/envutils"
"github.com/solo-io/gloo/pkg/utils/statsutils"
gwv1 "github.com/solo-io/gloo/projects/gateway/pkg/api/v1"
gloov1 "github.com/solo-io/gloo/projects/gloo/pkg/api/v1"
"github.com/solo-io/go-utils/contextutils"
"github.com/solo-io/solo-kit/pkg/api/v1/resources"
"github.com/solo-io/solo-kit/pkg/api/v1/resources/core"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/util/jsonpath"
)

const (
ClearStatusMetricsEnvVar = "GLOO_CLEAR_STATUS_METRICS"
)

type MetricLabels = gloov1.Settings_ObservabilityOptions_MetricLabels

var Names = map[schema.GroupVersionKind]string{
Expand All @@ -43,6 +50,7 @@ var descriptions = map[schema.GroupVersionKind]string{
// ConfigStatusMetrics is a collection of metrics, each of which records if the configuration for
// a particular resource type is valid
type ConfigStatusMetrics struct {
opts map[string]*MetricLabels
metrics map[schema.GroupVersionKind]*resourceMetric
}

Expand All @@ -60,21 +68,33 @@ func GetDefaultConfigStatusOptions() map[string]*MetricLabels {
// NewConfigStatusMetrics creates and returns a ConfigStatusMetrics from the specified options.
// If the options are invalid, an error is returned.
func NewConfigStatusMetrics(opts map[string]*MetricLabels) (ConfigStatusMetrics, error) {
metrics, err := prepareMetrics(opts)
if err != nil {
return ConfigStatusMetrics{}, err
}

configMetrics := ConfigStatusMetrics{
metrics: make(map[schema.GroupVersionKind]*resourceMetric),
opts: opts,
metrics: metrics,
}

return configMetrics, nil
}

func prepareMetrics(opts map[string]*MetricLabels) (map[schema.GroupVersionKind]*resourceMetric, error) {
metrics := make(map[schema.GroupVersionKind]*resourceMetric)
for gvkString, labels := range opts {
gvk, err := parseGroupVersionKind(gvkString)
if err != nil {
return ConfigStatusMetrics{}, err
return map[schema.GroupVersionKind]*resourceMetric{}, err
}
metric, err := newResourceMetric(gvk, labels.GetLabelToPath())
if err != nil {
return ConfigStatusMetrics{}, err
return map[schema.GroupVersionKind]*resourceMetric{}, err
}
configMetrics.insertMetric(gvk, metric)
metrics[gvk] = metric
}
return configMetrics, nil
return metrics, nil
}

func parseGroupVersionKind(arg string) (schema.GroupVersionKind, error) {
Expand Down Expand Up @@ -156,8 +176,42 @@ func (m *ConfigStatusMetrics) SetResourceInvalid(ctx context.Context, resource r
}
}

func (m *ConfigStatusMetrics) insertMetric(gvk schema.GroupVersionKind, metric *resourceMetric) {
m.metrics[gvk] = metric
// ClearMetrics removes all metrics from the ConfigStatusMetrics
func (m *ConfigStatusMetrics) ClearMetrics(ctx context.Context) {
// Our current metrics package uses a channel to unregister views,
// forcing callers sleep after calling ClearMetrics.
// We are concerned that required sleep may cause metrics to flicker.
// So, we are making this behavior opt-in.
// This is a temporary solution until we upgrade to another metrics package.
if !envutils.IsEnvTruthy(ClearStatusMetricsEnvVar) {
return
}

someViewsUnregistered := false

// Iterate through the resource metrics and unregister them
for _, metric := range m.metrics {
v := view.Find(metric.gauge.Name())
if v != nil {
view.Unregister(v)
someViewsUnregistered = true
}
}

// Only sleep when some metrics were unregistered
if someViewsUnregistered {
// Wait for the view to be unregistered (a channel is used)
// This is necessary because the view is unregistered asynchronously.
// We may not need this after we upgrade to an newer metrics package
time.Sleep(1 * time.Second)
}

// Add fresh metrics
var err error
m.metrics, err = prepareMetrics(m.opts)
if err != nil {
contextutils.LoggerFrom(ctx).Errorf("Error clearing resource metrics: %s", err.Error())
}
}

func getMutators(metric *resourceMetric, resource resources.Resource) ([]tag.Mutator, error) {
Expand Down
69 changes: 69 additions & 0 deletions pkg/utils/statsutils/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package metrics_test

import (
"context"
"os"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
Expand Down Expand Up @@ -131,4 +132,72 @@ var _ = Describe("ConfigStatusMetrics Test", func() {
Entry("Secret", "Secret.v1.gloo.solo.io", metrics.Names[gloov1.SecretGVK], makeSecret),
Entry("Proxy", "Proxy.v1.gloo.solo.io", metrics.Names[gloov1.ProxyGVK], makeProxy),
)

Describe("ClearMetrics", func() {
previousValue := os.Getenv(metrics.ClearStatusMetricsEnvVar)

AfterEach(func() {
os.Setenv(metrics.ClearStatusMetricsEnvVar, previousValue)
})

It("should clear metrics", func() {
os.Setenv(metrics.ClearStatusMetricsEnvVar, "true")

metricName, ok := metrics.Names[gwv1.VirtualServiceGVK]
Expect(ok).To(BeTrue())

opts := map[string]*metrics.MetricLabels{
"VirtualService.v1.gateway.solo.io": {
LabelToPath: map[string]string{
"name": "{.metadata.name}",
},
},
}
c, err := metrics.NewConfigStatusMetrics(opts)
Expect(err).NotTo(HaveOccurred())
Expect(c).NotTo(BeNil())

// Create two resources
res := makeVirtualService("clear")
resName := res.GetMetadata().GetName()

c.SetResourceInvalid(context.TODO(), res)
Expect(helpers.ReadMetricByLabel(metricName, "name", resName)).To(Equal(1))

c.ClearMetrics(context.TODO())
_, err = helpers.ReadMetricByLabel(metricName, "name", resName)
Expect(err).To(HaveOccurred())
})

It("should not clear metrics if the environment variable is not set", func() {
os.Unsetenv(metrics.ClearStatusMetricsEnvVar)

metricName, ok := metrics.Names[gwv1.VirtualServiceGVK]
Expect(ok).To(BeTrue())

opts := map[string]*metrics.MetricLabels{
"VirtualService.v1.gateway.solo.io": {
LabelToPath: map[string]string{
"name": "{.metadata.name}",
},
},
}
c, err := metrics.NewConfigStatusMetrics(opts)
Expect(err).NotTo(HaveOccurred())
Expect(c).NotTo(BeNil())

// Create two resources
res := makeVirtualService("clear")
resName := res.GetMetadata().GetName()

c.SetResourceInvalid(context.TODO(), res)
Expect(helpers.ReadMetricByLabel(metricName, "name", resName)).To(Equal(1))

c.ClearMetrics(context.TODO())

v, err := helpers.ReadMetricByLabel(metricName, "name", resName)
Expect(err).NotTo(HaveOccurred())
Expect(v).To(Equal(1))
})
})
})
6 changes: 1 addition & 5 deletions pkg/utils/statsutils/statsutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,7 @@ func MeasureOne(ctx context.Context, counter *stats.Int64Measure, tags ...tag.Mu

// Measure records the given value to the given counter
func Measure(ctx context.Context, counter *stats.Int64Measure, val int64, tags ...tag.Mutator) {
if err := stats.RecordWithTags(
ctx,
tags,
counter.M(val),
); err != nil {
if err := stats.RecordWithTags(ctx, tags, counter.M(val)); err != nil {
contextutils.LoggerFrom(ctx).Errorf("setting counter %v: %v", counter.Name(), err)
}
}
Expand Down
Loading

0 comments on commit 0719073

Please sign in to comment.