Skip to content

Commit cfba6ea

Browse files
committed
metric(aggmetric): introduce options for high cardinality metrics
Previously, we introduced high cardinality metrics. This patch introduces `HighCardinalityMetricOptions` to support user-defined configuration. It has the following fields:

- Metadata: the metadata associated with the metric.
- MaxLabelValues (default: 5000): the maximum number of distinct label value combinations that can be stored before eviction.
- RetentionTimeTillEviction (default: 20 seconds): the duration after which unused label value combinations can be evicted. Entries that haven't been accessed for longer than this duration may be evicted.

A label value combination is removed when the number of label value combinations is greater than MaxLabelValues and the combination has passed its RetentionTimeTillEviction.

This commit also introduces 2 environment variables:

- COCKROACH_HIGH_CARDINALITY_METRICS_MAX_LABEL_VALUES (default: 0): the larger of this variable and `MaxLabelValues` is used during initialisation as the maximum number of label combinations.
- COCKROACH_HIGH_CARDINALITY_METRICS_RETENTION_TIME_TILL_EVICTION (default: 10): the larger of this variable and `RetentionTimeTillEviction` is used during initialisation as the maximum retention of a label.

Epic: CRDB-53398
Part of: CRDB-54635
Release notes: None
1 parent 40466eb commit cfba6ea

15 files changed

+868
-31
lines changed

pkg/util/metric/aggmetric/agg_metric.go

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,14 @@ import (
2828
var delimiter = []byte{'_'}
2929

3030
const (
31-
dbLabel = "database"
32-
appLabel = "application_name"
33-
cacheSize = 5000
34-
retentionTimeTillEviction = 20 * time.Second
31+
dbLabel = "database"
32+
appLabel = "application_name"
33+
// defaultCacheSize is the default maximum number of distinct label value combinations
34+
// before eviction starts in high cardinality metrics.
35+
defaultCacheSize = 5000
36+
// defaultRetentionTimeTillEviction is the default duration after which unused
37+
// label value combinations can be evicted from high cardinality metrics.
38+
defaultRetentionTimeTillEviction = 20 * time.Second
3539
)
3640

3741
// This is a no-op context used during logging.
@@ -106,22 +110,38 @@ func (cs *childSet) initWithBTreeStorageType(labels []string) {
106110
}
107111
}
108112

109-
func (cs *childSet) initWithCacheStorageType(labels []string, metricName string) {
113+
func (cs *childSet) initWithCacheStorageType(
114+
labels []string, metricName string, opts metric.HighCardinalityMetricOptions,
115+
) {
110116
cs.labels = labels
111117

118+
// Determine maxLabelValues: use the maximum of env variable and metric defined opts.
119+
maxLabelValues := opts.MaxLabelValues
120+
if maxLabelValues == 0 {
121+
maxLabelValues = defaultCacheSize
122+
}
123+
maxLabelValues = max(maxLabelValues, metric.MaxLabelValues)
124+
125+
// Determine retentionDuration: use the maximum of env variable and metric defined opts.
126+
retentionDuration := opts.RetentionTimeTillEviction
127+
if retentionDuration == 0 {
128+
retentionDuration = defaultRetentionTimeTillEviction
129+
}
130+
retentionDuration = max(retentionDuration, metric.RetentionTimeTillEviction)
131+
112132
cs.mu.children = &UnorderedCacheWrapper{
113133
cache: cache.NewUnorderedCache(cache.Config{
114134
Policy: cache.CacheLRU,
115135
ShouldEvict: func(size int, key, value any) bool {
116136
if childMetric, ok := value.(ChildMetric); ok {
117-
// Check if the child metric has exceeded 20 seconds and cache size is greater than 5000
137+
// Check if the child metric has exceeded the retention time and cache size is greater than max
118138
if labelSliceCachedChildMetric, ok := childMetric.(LabelSliceCachedChildMetric); ok {
119139
currentTime := timeutil.Now()
120140
age := currentTime.Sub(labelSliceCachedChildMetric.CreatedAt())
121-
return size > cacheSize && age > retentionTimeTillEviction
141+
return size > maxLabelValues && age > retentionDuration
122142
}
123143
}
124-
return size > cacheSize
144+
return size > maxLabelValues
125145
},
126146
OnEvictedEntry: func(entry *cache.Entry) {
127147
if childMetric, ok := entry.Value.(ChildMetric); ok {
@@ -144,9 +164,8 @@ func (cs *childSet) initWithCacheStorageType(labels []string, metricName string)
144164
func getCacheStorage() *cache.UnorderedCache {
145165
cacheStorage := cache.NewUnorderedCache(cache.Config{
146166
Policy: cache.CacheLRU,
147-
//TODO (aa-joshi) : make cacheSize configurable in the future
148167
ShouldEvict: func(size int, key, value interface{}) bool {
149-
return size > cacheSize
168+
return size > defaultCacheSize
150169
},
151170
})
152171
return cacheStorage

pkg/util/metric/aggmetric/agg_metric_test.go

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,3 +495,183 @@ func TestConcurrentUpdatesAndReinitialiseMetric(t *testing.T) {
495495
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(true), metric.WithIncludeAggregateMetrics(true))
496496
})
497497
}
498+
499+
// TestHighCardinalityMetricsWithOptions tests all high cardinality metric types
500+
// (Counter, Gauge, Histogram) with both custom and default options.
501+
func TestHighCardinalityMetricsWithOptions(t *testing.T) {
502+
defer leaktest.AfterTest(t)()
503+
504+
// Test with custom options
505+
t.Run("CustomOptions", func(t *testing.T) {
506+
// Save and restore the original values
507+
originalMaxLabelValues := metric.MaxLabelValues
508+
originalRetentionTime := metric.RetentionTimeTillEviction
509+
defer func() {
510+
metric.MaxLabelValues = originalMaxLabelValues
511+
metric.RetentionTimeTillEviction = originalRetentionTime
512+
}()
513+
514+
const customCacheSize = 5
515+
const customRetention = 2 * time.Second
516+
517+
// Set the package-level variables directly since they're cached at package load time
518+
metric.MaxLabelValues = 5
519+
metric.RetentionTimeTillEviction = 2 * time.Second
520+
521+
r := metric.NewRegistry()
522+
writePrometheusMetrics := WritePrometheusMetricsFunc(r)
523+
524+
// Create all three metric types with custom options
525+
counter := NewHighCardinalityCounter(
526+
metric.HighCardinalityMetricOptions{
527+
Metadata: metric.Metadata{Name: "custom_options_counter"},
528+
MaxLabelValues: customCacheSize,
529+
RetentionTimeTillEviction: customRetention,
530+
},
531+
"database", "application_name",
532+
)
533+
r.AddMetric(counter)
534+
535+
gauge := NewHighCardinalityGauge(
536+
metric.HighCardinalityMetricOptions{
537+
Metadata: metric.Metadata{Name: "custom_options_gauge"},
538+
MaxLabelValues: customCacheSize,
539+
RetentionTimeTillEviction: customRetention,
540+
},
541+
"database", "application_name",
542+
)
543+
r.AddMetric(gauge)
544+
545+
histogram := NewHighCardinalityHistogram(
546+
metric.HistogramOptions{
547+
Metadata: metric.Metadata{
548+
Name: "custom_options_histogram",
549+
},
550+
Duration: base.DefaultHistogramWindowInterval(),
551+
MaxVal: 100,
552+
SigFigs: 1,
553+
BucketConfig: metric.Percent100Buckets,
554+
HighCardinalityOpts: metric.HighCardinalityMetricOptions{
555+
MaxLabelValues: customCacheSize,
556+
RetentionTimeTillEviction: customRetention,
557+
},
558+
},
559+
"database", "application_name",
560+
)
561+
r.AddMetric(histogram)
562+
563+
// Initialize with label slice caches
564+
labelSliceCache := metric.NewLabelSliceCache()
565+
counter.InitializeMetrics(labelSliceCache)
566+
gauge.InitializeMetrics(labelSliceCache)
567+
histogram.InitializeMetrics(labelSliceCache)
568+
569+
// Add more entries than the custom cache size
570+
for i := 0; i < customCacheSize+3; i++ {
571+
counter.Inc(1, "db1", strconv.Itoa(i))
572+
gauge.Update(int64(i+1), "db1", strconv.Itoa(i))
573+
histogram.RecordValue(int64(i+1), "db1", strconv.Itoa(i))
574+
}
575+
576+
// Wait for custom retention time to pass
577+
time.Sleep(customRetention + 500*time.Millisecond)
578+
579+
testFile := "HighCardinalityMetrics_custom_options_pre_eviction.txt"
580+
if metric.HdrEnabled() {
581+
testFile = "HighCardinalityMetrics_custom_options_pre_eviction_hdr.txt"
582+
}
583+
echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, testFile))
584+
585+
// Add new entries to trigger eviction
586+
for i := customCacheSize + 3; i < customCacheSize+6; i++ {
587+
counter.Inc(1, "db2", strconv.Itoa(i))
588+
gauge.Inc(5, "db2", strconv.Itoa(i))
589+
histogram.RecordValue(int64(i+10), "db2", strconv.Itoa(i))
590+
}
591+
592+
testFile = "HighCardinalityMetrics_custom_options_post_eviction.txt"
593+
if metric.HdrEnabled() {
594+
testFile = "HighCardinalityMetrics_custom_options_post_eviction_hdr.txt"
595+
}
596+
echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, testFile))
597+
598+
// Verify that old entries were evicted
599+
for i := 0; i < 3; i++ {
600+
metricKey := metric.LabelSliceCacheKey(metricKey("db1", strconv.Itoa(i)))
601+
_, ok := labelSliceCache.Get(metricKey)
602+
require.False(t, ok, "old entry should have been evicted")
603+
}
604+
})
605+
606+
// Test with default options
607+
t.Run("DefaultOptions", func(t *testing.T) {
608+
r := metric.NewRegistry()
609+
writePrometheusMetrics := WritePrometheusMetricsFunc(r)
610+
611+
// Create all three metric types with default options
612+
counter := NewHighCardinalityCounter(
613+
metric.HighCardinalityMetricOptions{
614+
Metadata: metric.Metadata{Name: "default_options_counter"},
615+
MaxLabelValues: 0, // Should default to 5000
616+
RetentionTimeTillEviction: 0, // Should default to 20 seconds
617+
},
618+
"database", "application_name",
619+
)
620+
r.AddMetric(counter)
621+
622+
gauge := NewHighCardinalityGauge(
623+
metric.HighCardinalityMetricOptions{
624+
Metadata: metric.Metadata{Name: "default_options_gauge"},
625+
MaxLabelValues: 0, // Should default to 5000
626+
RetentionTimeTillEviction: 0, // Should default to 20 seconds
627+
},
628+
"database", "application_name",
629+
)
630+
r.AddMetric(gauge)
631+
632+
histogram := NewHighCardinalityHistogram(
633+
metric.HistogramOptions{
634+
Metadata: metric.Metadata{
635+
Name: "default_options_histogram",
636+
},
637+
Duration: base.DefaultHistogramWindowInterval(),
638+
MaxVal: 100,
639+
SigFigs: 1,
640+
BucketConfig: metric.Percent100Buckets,
641+
HighCardinalityOpts: metric.HighCardinalityMetricOptions{
642+
MaxLabelValues: 0, // Should default to 5000
643+
RetentionTimeTillEviction: 0, // Should default to 20 seconds
644+
},
645+
},
646+
"database", "application_name",
647+
)
648+
r.AddMetric(histogram)
649+
650+
// Initialize with label slice caches
651+
labelSliceCache := metric.NewLabelSliceCache()
652+
counter.InitializeMetrics(labelSliceCache)
653+
gauge.InitializeMetrics(labelSliceCache)
654+
histogram.InitializeMetrics(labelSliceCache)
655+
656+
// Add a few entries
657+
for i := 0; i < 10; i++ {
658+
counter.Inc(1, "db1", strconv.Itoa(i))
659+
gauge.Update(int64(i+1), "db1", strconv.Itoa(i))
660+
histogram.RecordValue(int64(i+1), "db1", strconv.Itoa(i))
661+
}
662+
663+
testFile := "HighCardinalityMetrics_default_options.txt"
664+
if metric.HdrEnabled() {
665+
testFile = "HighCardinalityMetrics_default_options_hdr.txt"
666+
}
667+
echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, testFile))
668+
669+
// Verify all entries are still present (shouldn't evict with default settings)
670+
for i := 0; i < 10; i++ {
671+
metricKey := metric.LabelSliceCacheKey(metricKey("db1", strconv.Itoa(i)))
672+
labelSliceValue, ok := labelSliceCache.Get(metricKey)
673+
require.True(t, ok, "entry should still be present with default settings")
674+
require.Equal(t, int64(3), labelSliceValue.Counter.Load())
675+
}
676+
})
677+
}

pkg/util/metric/aggmetric/counter.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -410,12 +410,14 @@ var _ metric.Iterable = (*HighCardinalityCounter)(nil)
410410
var _ metric.PrometheusEvictable = (*HighCardinalityCounter)(nil)
411411

412412
// NewHighCardinalityCounter constructs a new HighCardinalityCounter that uses cache storage
413-
// with eviction for child metrics.
413+
// with eviction for child metrics. The opts parameter contains the metadata and allows configuring
414+
// the maximum number of label combinations (MaxLabelValues) and retention time (RetentionTimeTillEviction).
415+
// If opts has zero values for MaxLabelValues or RetentionTimeTillEviction, defaults will be used.
414416
func NewHighCardinalityCounter(
415-
metadata metric.Metadata, childLabels ...string,
417+
opts metric.HighCardinalityMetricOptions, childLabels ...string,
416418
) *HighCardinalityCounter {
417-
c := &HighCardinalityCounter{g: *metric.NewCounter(metadata)}
418-
c.initWithCacheStorageType(childLabels, metadata.Name)
419+
c := &HighCardinalityCounter{g: *metric.NewCounter(opts.Metadata)}
420+
c.initWithCacheStorageType(childLabels, opts.Metadata.Name, opts)
419421
return c
420422
}
421423

pkg/util/metric/aggmetric/counter_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,12 @@ func TestHighCardinalityCounter(t *testing.T) {
8282
r := metric.NewRegistry()
8383
writePrometheusMetrics := WritePrometheusMetricsFunc(r)
8484

85-
c := NewHighCardinalityCounter(metric.Metadata{
86-
Name: "foo_counter",
87-
}, "database", "application_name")
85+
c := NewHighCardinalityCounter(
86+
metric.HighCardinalityMetricOptions{
87+
Metadata: metric.Metadata{Name: "foo_counter"},
88+
},
89+
"database", "application_name",
90+
)
8891
c.mu.children = &UnorderedCacheWrapper{
8992
cache: initialiseCacheStorageForTesting(),
9093
}
@@ -144,6 +147,7 @@ func TestHighCardinalityCounter(t *testing.T) {
144147
}
145148

146149
func initialiseCacheStorageForTesting() *cache.UnorderedCache {
150+
const cacheSize = 10
147151
return cache.NewUnorderedCache(cache.Config{
148152
Policy: cache.CacheLRU,
149153
ShouldEvict: func(size int, key, value interface{}) bool {

pkg/util/metric/aggmetric/gauge.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -537,12 +537,14 @@ var _ metric.Iterable = (*HighCardinalityGauge)(nil)
537537
var _ metric.PrometheusEvictable = (*HighCardinalityGauge)(nil)
538538

539539
// NewHighCardinalityGauge constructs a new HighCardinalityGauge that uses cache storage
540-
// with eviction for child metrics.
540+
// with eviction for child metrics. The opts parameter contains the metadata and allows configuring
541+
// the maximum number of label combinations (MaxLabelValues) and retention time (RetentionTimeTillEviction).
542+
// If opts has zero values for MaxLabelValues or RetentionTimeTillEviction, defaults will be used.
541543
func NewHighCardinalityGauge(
542-
metadata metric.Metadata, childLabels ...string,
544+
opts metric.HighCardinalityMetricOptions, childLabels ...string,
543545
) *HighCardinalityGauge {
544-
g := &HighCardinalityGauge{g: *metric.NewGauge(metadata)}
545-
g.initWithCacheStorageType(childLabels, metadata.Name)
546+
g := &HighCardinalityGauge{g: *metric.NewGauge(opts.Metadata)}
547+
g.initWithCacheStorageType(childLabels, opts.Metadata.Name, opts)
546548
return g
547549
}
548550

pkg/util/metric/aggmetric/gauge_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ func TestHighCardinalityGauge(t *testing.T) {
9090
r := metric.NewRegistry()
9191
writePrometheusMetrics := WritePrometheusMetricsFunc(r)
9292

93-
g := NewHighCardinalityGauge(metric.Metadata{
94-
Name: "foo_gauge",
93+
g := NewHighCardinalityGauge(metric.HighCardinalityMetricOptions{
94+
Metadata: metric.Metadata{Name: "foo_gauge"},
9595
}, "database", "application_name")
9696
g.mu.children = &UnorderedCacheWrapper{
9797
cache: initialiseCacheStorageForTesting(),
@@ -155,8 +155,8 @@ func TestHighCardinalityGaugeMethods(t *testing.T) {
155155
r := metric.NewRegistry()
156156
writePrometheusMetrics := WritePrometheusMetricsFunc(r)
157157

158-
g := NewHighCardinalityGauge(metric.Metadata{
159-
Name: "foo_gauge",
158+
g := NewHighCardinalityGauge(metric.HighCardinalityMetricOptions{
159+
Metadata: metric.Metadata{Name: "foo_gauge"},
160160
}, "database", "application_name")
161161
g.mu.children = &UnorderedCacheWrapper{
162162
cache: initialiseCacheStorageForTesting(),

pkg/util/metric/aggmetric/histogram.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,9 @@ var _ metric.WindowedHistogram = (*HighCardinalityHistogram)(nil)
382382
var _ metric.CumulativeHistogram = (*HighCardinalityHistogram)(nil)
383383

384384
// NewHighCardinalityHistogram constructs a new HighCardinalityHistogram that uses cache storage
385-
// with eviction for child metrics.
385+
// with eviction for child metrics. The HighCardinalityOpts field in opts allows configuring
386+
// the maximum number of label combinations (MaxLabelValues) and retention time (RetentionTimeTillEviction).
387+
// If HighCardinalityOpts is not provided or has zero values, defaults will be used.
386388
func NewHighCardinalityHistogram(
387389
opts metric.HistogramOptions, childLabels ...string,
388390
) *HighCardinalityHistogram {
@@ -410,7 +412,7 @@ func NewHighCardinalityHistogram(
410412
childHist.h.Tick()
411413
})
412414
})
413-
h.initWithCacheStorageType(childLabels, opts.Metadata.Name)
415+
h.initWithCacheStorageType(childLabels, opts.Metadata.Name, opts.HighCardinalityOpts)
414416
return h
415417
}
416418

pkg/util/metric/aggmetric/histogram_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,11 @@ func TestHighCardinalityHistogram(t *testing.T) {
9898
Metadata: metric.Metadata{
9999
Name: "histo_gram",
100100
},
101-
Duration: base.DefaultHistogramWindowInterval(),
102-
MaxVal: 100,
103-
SigFigs: 1,
104-
BucketConfig: metric.Percent100Buckets,
101+
Duration: base.DefaultHistogramWindowInterval(),
102+
MaxVal: 100,
103+
SigFigs: 1,
104+
BucketConfig: metric.Percent100Buckets,
105+
HighCardinalityOpts: metric.HighCardinalityMetricOptions{},
105106
}, "database", "application_name")
106107

107108
h.mu.children = &UnorderedCacheWrapper{

0 commit comments

Comments
 (0)