Skip to content

Commit 236deb6

Browse files
committed
pkg/*: Refactor metric visibility handling across the codebase
This commit replaces the boolean `Essential` field with a `Visibility` field in metric definitions across the codebase, so that each metric's metadata categorizes it as INTERNAL, SUPPORT, or ESSENTIAL. This makes the intended audience of each metric explicit and ensures that metric visibility is handled consistently throughout the codebase. Part of: CRDB-57261. Epic: CRDB-55082. Release note: None.
1 parent 65848d0 commit 236deb6

File tree

30 files changed

+325
-295
lines changed

30 files changed

+325
-295
lines changed

build/tools/gen-cockroachdb-metrics/main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ type MetricInfo struct {
5959
Aggregation string `yaml:"aggregation"`
6060
Derivative string `yaml:"derivative"`
6161
HowToUse string `yaml:"how_to_use,omitempty"`
62-
Essential bool `yaml:"essential,omitempty"`
62+
Visibility string `yaml:"visibility,omitempty"`
6363
}
6464

6565
// Category represents a category of metrics
@@ -170,9 +170,9 @@ func parseDatadogMappings(r io.Reader) (map[string]string, error) {
170170

171171
// CRDB-Datadog mappings are stored as python dictionaries in the following file:
172172
// https://github.com/DataDog/integrations-core/blob/master/cockroachdb/datadog_checks/cockroachdb/metrics.py
173-
// - METRIC_MAP: represents the raw CRDB-Datadog metric name mapping.
173+
// - METRIC_MAP: represents the raw CRDB-Datadog metric name mapping.
174174
// - OMV2_METRIC_MAP: represents the metric in OpenMetrics V2 format.
175-
// E.g.
175+
// E.g.
176176
// 'admission_errored_sql_kv_response': 'admission.errored.sql_kv.response'
177177
// here the key is the CRDB metric name in prometheus format, and the value is the corresponding metric name visible in Datadog.
178178
// Both maps are mutually exclusive. Parse both dictionaries to get the complete mapping.

docs/generated/metrics/metrics.yaml

Lines changed: 126 additions & 126 deletions
Large diffs are not rendered by default.

pkg/backup/schedule_exec.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -604,7 +604,7 @@ func init() {
604604
`),
605605
Measurement: "Jobs",
606606
Unit: metric.Unit_TIMESTAMP_SEC,
607-
Essential: true,
607+
Visibility: metric.Metadata_ESSENTIAL,
608608
Category: metric.Metadata_SQL,
609609
HowToUse: crstrings.UnwrapText(`
610610
Monitor this metric to ensure that backups are meeting the

pkg/base/license.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ var LicenseTTLMetadata = metric.Metadata{
4141
Help: "Seconds until license expiry (0 if no license present)",
4242
Measurement: "Seconds",
4343
Unit: metric.Unit_SECONDS,
44-
Essential: true,
44+
Visibility: metric.Metadata_ESSENTIAL,
4545
Category: metric.Metadata_EXPIRATIONS,
4646
HowToUse: "See Description.",
4747
}
@@ -51,7 +51,7 @@ var AdditionalLicenseTTLMetadata = metric.Metadata{
5151
Help: "Seconds until license expiry (0 if no license present)",
5252
Measurement: "Seconds",
5353
Unit: metric.Unit_SECONDS,
54-
Essential: true,
54+
Visibility: metric.Metadata_ESSENTIAL,
5555
Category: metric.Metadata_EXPIRATIONS,
5656
HowToUse: "See Description.",
5757
}

pkg/ccl/changefeedccl/metrics.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,7 @@ var (
765765
Help: "Total retryable errors encountered by all changefeeds",
766766
Measurement: "Errors",
767767
Unit: metric.Unit_COUNT,
768-
Essential: true,
768+
Visibility: metric.Metadata_ESSENTIAL,
769769
Category: metric.Metadata_CHANGEFEEDS,
770770
HowToUse: crstrings.UnwrapText(`
771771
This metric tracks transient changefeed errors. Alert on "too many"
@@ -781,7 +781,7 @@ var (
781781
Help: "Total number of changefeed jobs which have failed",
782782
Measurement: "Errors",
783783
Unit: metric.Unit_COUNT,
784-
Essential: true,
784+
Visibility: metric.Metadata_ESSENTIAL,
785785
Category: metric.Metadata_CHANGEFEEDS,
786786
HowToUse: crstrings.UnwrapText(`
787787
This metric tracks the permanent changefeed job failures that the jobs
@@ -866,7 +866,7 @@ func newAggregateMetrics(histogramWindow time.Duration, lookup *cidr.Lookup) *Ag
866866
Help: "Messages emitted by all feeds",
867867
Measurement: "Messages",
868868
Unit: metric.Unit_COUNT,
869-
Essential: true,
869+
Visibility: metric.Metadata_ESSENTIAL,
870870
Category: metric.Metadata_CHANGEFEEDS,
871871
HowToUse: crstrings.UnwrapText(`
872872
This metric provides a useful context when assessing the state of
@@ -894,7 +894,7 @@ func newAggregateMetrics(histogramWindow time.Duration, lookup *cidr.Lookup) *Ag
894894
Help: "Bytes emitted by all feeds",
895895
Measurement: "Bytes",
896896
Unit: metric.Unit_BYTES,
897-
Essential: true,
897+
Visibility: metric.Metadata_ESSENTIAL,
898898
Category: metric.Metadata_CHANGEFEEDS,
899899
HowToUse: crstrings.UnwrapText(`
900900
This metric provides a useful context when assessing the state of
@@ -945,7 +945,7 @@ func newAggregateMetrics(histogramWindow time.Duration, lookup *cidr.Lookup) *Ag
945945
`),
946946
Measurement: "Nanoseconds",
947947
Unit: metric.Unit_NANOSECONDS,
948-
Essential: true,
948+
Visibility: metric.Metadata_ESSENTIAL,
949949
Category: metric.Metadata_CHANGEFEEDS,
950950
HowToUse: crstrings.UnwrapText(`
951951
This metric provides a useful context when assessing the state of
@@ -982,7 +982,7 @@ func newAggregateMetrics(histogramWindow time.Duration, lookup *cidr.Lookup) *Ag
982982
Help: "Number of currently running changefeeds, including sinkless",
983983
Measurement: "Changefeeds",
984984
Unit: metric.Unit_COUNT,
985-
Essential: true,
985+
Visibility: metric.Metadata_ESSENTIAL,
986986
Category: metric.Metadata_CHANGEFEEDS,
987987
HowToUse: `This metric tracks the total number of all running changefeeds.`,
988988
}

pkg/cli/gen.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ type MetricInfo struct {
4747
Aggregation string `yaml:"aggregation"`
4848
Derivative string `yaml:"derivative"`
4949
HowToUse string `yaml:"how_to_use,omitempty"`
50-
Essential bool `yaml:"essential,omitempty"`
50+
Visibility string `yaml:"visibility,omitempty"`
5151
}
5252

5353
type Category struct {
@@ -477,6 +477,11 @@ func generateMetricList(ctx context.Context, skipFiltering bool) (map[string]*La
477477

478478
for _, chart := range section.Charts {
479479
// There are many charts, but only 1 metric per chart.
480+
visibility := chart.Metrics[0].Visibility
481+
// Only emit visibility when it is not the default INTERNAL value; an empty string is dropped by the yaml `omitempty` tag.
482+
if visibility == "INTERNAL" {
483+
visibility = ""
484+
}
480485
metric := MetricInfo{
481486
Name: chart.Metrics[0].Name,
482487
ExportedName: chart.Metrics[0].ExportedName,
@@ -488,7 +493,7 @@ func generateMetricList(ctx context.Context, skipFiltering bool) (map[string]*La
488493
Aggregation: chart.Aggregator.String(),
489494
Derivative: chart.Derivative.String(),
490495
HowToUse: strings.TrimSpace(chart.Metrics[0].HowToUse),
491-
Essential: chart.Metrics[0].Essential,
496+
Visibility: visibility,
492497
}
493498
category.Metrics = append(category.Metrics, metric)
494499
}

pkg/crosscluster/logical/metrics.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ var (
1818
Name: "logical_replication.events_ingested",
1919
Help: "Events ingested by all replication jobs",
2020
Measurement: "Events",
21-
Essential: true,
21+
Visibility: metric.Metadata_ESSENTIAL,
2222
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
2323
Unit: metric.Unit_COUNT,
2424
HowToUse: "track events (e.g. updates, deletes, inserts) ingested",
@@ -27,15 +27,15 @@ var (
2727
Name: "logical_replication.events_dlqed",
2828
Help: "Row update events sent to DLQ",
2929
Measurement: "Failures",
30-
Essential: true,
30+
Visibility: metric.Metadata_ESSENTIAL,
3131
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
3232
Unit: metric.Unit_COUNT,
3333
HowToUse: "track events sent to the dead letter queue",
3434
}
3535
metaReceivedLogicalBytes = metric.Metadata{
3636
Name: "logical_replication.logical_bytes",
3737
Help: "Logical bytes (sum of keys + values) received by all replication jobs",
38-
Essential: true,
38+
Visibility: metric.Metadata_ESSENTIAL,
3939
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
4040
Measurement: "Bytes",
4141
Unit: metric.Unit_BYTES,
@@ -49,7 +49,7 @@ var (
4949
between the oldest event in the batch and flush is recorded
5050
`),
5151
Measurement: "Nanoseconds",
52-
Essential: true,
52+
Visibility: metric.Metadata_ESSENTIAL,
5353
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
5454
Unit: metric.Unit_NANOSECONDS,
5555
HowToUse: "track the latency of of applying events from source to destination",
@@ -58,7 +58,7 @@ var (
5858
Name: "logical_replication.replicated_time_seconds",
5959
Help: "The replicated time of the logical replication stream in seconds since the unix epoch.",
6060
Measurement: "Seconds",
61-
Essential: true,
61+
Visibility: metric.Metadata_ESSENTIAL,
6262
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
6363
Unit: metric.Unit_SECONDS,
6464
HowToUse: "Track replication lag via current time - logical_replication.replicated_time_seconds",

pkg/crosscluster/physical/metrics.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ var (
3535
Name: "physical_replication.logical_bytes",
3636
Help: "Logical bytes (sum of keys + values) ingested by all replication jobs",
3737
Measurement: "Bytes",
38-
Essential: true,
38+
Visibility: metric.Metadata_ESSENTIAL,
3939
Category: metric.Metadata_CROSS_CLUSTER_REPLICATION,
4040
Unit: metric.Unit_BYTES,
4141
HowToUse: "Track PCR throughput",
@@ -79,7 +79,7 @@ var (
7979
Name: "physical_replication.replicated_time_seconds",
8080
Help: "The replicated time of the physical replication stream in seconds since the unix epoch.",
8181
Measurement: "Seconds",
82-
Essential: true,
82+
Visibility: metric.Metadata_ESSENTIAL,
8383
Category: metric.Metadata_CROSS_CLUSTER_REPLICATION,
8484
Unit: metric.Unit_SECONDS,
8585
HowToUse: "Track replication lag via current time - physical_replication.replicated_time_seconds",

pkg/jobs/metrics.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ func makeMetaCurrentlyRunning(jt jobspb.Type) metric.Metadata {
104104

105105
switch jt {
106106
case jobspb.TypeCreateStats, jobspb.TypeAutoCreateStats, jobspb.TypeAutoCreatePartialStats:
107-
m.Essential = true
107+
m.Visibility = metric.Metadata_ESSENTIAL
108108
m.Category = metric.Metadata_SQL
109109
var detail string
110110
if jt == jobspb.TypeCreateStats {
@@ -116,11 +116,11 @@ func makeMetaCurrentlyRunning(jt jobspb.Type) metric.Metadata {
116116
}
117117
m.HowToUse = fmt.Sprintf(`This metric tracks the number of active %s statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.`, detail)
118118
case jobspb.TypeBackup:
119-
m.Essential = true
119+
m.Visibility = metric.Metadata_ESSENTIAL
120120
m.Category = metric.Metadata_SQL
121121
m.HowToUse = `See Description.`
122122
case jobspb.TypeRowLevelTTL:
123-
m.Essential = true
123+
m.Visibility = metric.Metadata_ESSENTIAL
124124
m.Category = metric.Metadata_TTL
125125
m.HowToUse = `Monitor this metric to ensure there are not too many Row Level TTL jobs running at the same time. Generally, this metric should be in the low single digits.`
126126
}
@@ -162,23 +162,23 @@ func makeMetaCurrentlyPaused(jt jobspb.Type) metric.Metadata {
162162
}
163163
switch jt {
164164
case jobspb.TypeAutoCreateStats, jobspb.TypeAutoCreatePartialStats:
165-
m.Essential = true
165+
m.Visibility = metric.Metadata_ESSENTIAL
166166
m.Category = metric.Metadata_SQL
167167
var partialDetail string
168168
if jt == jobspb.TypeAutoCreatePartialStats {
169169
partialDetail = "partial "
170170
}
171171
m.HowToUse = fmt.Sprintf(`This metric is a high-level indicator that automatically generated %sstatistics jobs are paused which can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.`, partialDetail)
172172
case jobspb.TypeBackup:
173-
m.Essential = true
173+
m.Visibility = metric.Metadata_ESSENTIAL
174174
m.Category = metric.Metadata_SQL
175175
m.HowToUse = `Monitor and alert on this metric to safeguard against an inadvertent operational error of leaving a backup job in a paused state for an extended period of time. In functional areas, a paused job can hold resources or have concurrency impact or some other negative consequence. Paused backup may break the recovery point objective (RPO).`
176176
case jobspb.TypeChangefeed:
177-
m.Essential = true
177+
m.Visibility = metric.Metadata_ESSENTIAL
178178
m.Category = metric.Metadata_CHANGEFEEDS
179179
m.HowToUse = `Monitor and alert on this metric to safeguard against an inadvertent operational error of leaving a changefeed job in a paused state for an extended period of time. Changefeed jobs should not be paused for a long time because the protected timestamp prevents garbage collection.`
180180
case jobspb.TypeRowLevelTTL:
181-
m.Essential = true
181+
m.Visibility = metric.Metadata_ESSENTIAL
182182
m.Category = metric.Metadata_TTL
183183
m.HowToUse = `Monitor this metric to ensure the Row Level TTL job does not remain paused inadvertently for an extended period.`
184184
}
@@ -203,7 +203,7 @@ func makeMetaResumeCompeted(jt jobspb.Type) metric.Metadata {
203203

204204
switch jt {
205205
case jobspb.TypeRowLevelTTL:
206-
m.Essential = true
206+
m.Visibility = metric.Metadata_ESSENTIAL
207207
m.Category = metric.Metadata_TTL
208208
m.HowToUse = `If Row Level TTL is enabled, this metric should be nonzero and correspond to the ttl_cron setting that was chosen. If this metric is zero, it means the job is not running`
209209
}
@@ -245,15 +245,15 @@ func makeMetaResumeFailed(jt jobspb.Type) metric.Metadata {
245245

246246
switch jt {
247247
case jobspb.TypeAutoCreateStats, jobspb.TypeAutoCreatePartialStats:
248-
m.Essential = true
248+
m.Visibility = metric.Metadata_ESSENTIAL
249249
m.Category = metric.Metadata_SQL
250250
var partialDetail string
251251
if jt == jobspb.TypeAutoCreatePartialStats {
252252
partialDetail = "partial "
253253
}
254254
m.HowToUse = fmt.Sprintf(`This metric is a high-level indicator that automatically generated %stable statistics is failing. Failed statistic creation can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.`, partialDetail)
255255
case jobspb.TypeRowLevelTTL:
256-
m.Essential = true
256+
m.Visibility = metric.Metadata_ESSENTIAL
257257
m.Category = metric.Metadata_TTL
258258
m.HowToUse = `This metric should remain at zero. Repeated errors means the Row Level TTL job is not deleting data.`
259259
}
@@ -327,7 +327,7 @@ func makeMetaProtectedAge(jt jobspb.Type) metric.Metadata {
327327

328328
switch jt {
329329
case jobspb.TypeChangefeed:
330-
m.Essential = true
330+
m.Visibility = metric.Metadata_ESSENTIAL
331331
m.Category = metric.Metadata_CHANGEFEEDS
332332
m.HowToUse = `Changefeeds use protected timestamps to protect the data from being garbage collected. Ensure the protected timestamp age does not significantly exceed the GC TTL zone configuration. Alert on this metric if the protected timestamp age is greater than 3 times the GC TTL.`
333333
}

pkg/jobs/schedule_metrics.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,13 +144,13 @@ func MakeExecutorMetrics(name string) ExecutorMetrics {
144144
}
145145

146146
if name == tree.ScheduledBackupExecutor.UserName() {
147-
m.NumFailed.Essential = true
147+
m.NumFailed.Visibility = metric.Metadata_ESSENTIAL
148148
m.NumFailed.Category = metric.Metadata_SQL
149149
m.NumFailed.HowToUse = `Monitor this metric and investigate backup job failures.`
150150
}
151151

152152
if name == tree.ScheduledRowLevelTTLExecutor.InternalName() {
153-
m.NumFailed.Essential = true
153+
m.NumFailed.Visibility = metric.Metadata_ESSENTIAL
154154
m.NumFailed.Category = metric.Metadata_TTL
155155
m.NumFailed.HowToUse = `Monitor this metric to ensure the Row Level TTL job is running. If it is non-zero, it means the job could not be created.`
156156
}

0 commit comments

Comments
 (0)