Skip to content

Commit 401bb1b

Browse files
authored
Move invalid metrics_name to a dedicated place and re-implement a small parser to collect them (#24)
Signed-off-by: Augustin Husson <[email protected]>
1 parent 29cf95b commit 401bb1b

File tree

15 files changed

+5884
-99
lines changed

15 files changed

+5884
-99
lines changed

README.md

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@ Metrics Usage
66

77
This tool analyzes static files - like dashboards and Prometheus alert rules - to track where and how Prometheus metrics are used.
88

9-
It’s especially helpful for identifying whether metrics are actively used. Unused metrics should ideally not be scraped by Prometheus to avoid unnecessary load.
9+
It’s especially helpful for identifying whether metrics are actively used.
10+
Ideally, Prometheus should not scrape unused metrics, to avoid unnecessary load.
11+
12+
## API exposed
13+
14+
### Metrics
1015

1116
The tool provides an API endpoint, `/api/v1/metrics`, which returns the usage data for each collected metric as shown below:
1217

@@ -77,7 +82,41 @@ You can use the following query parameters to filter the list returned:
7782
* **metric_name**: when used, it will trigger a fuzzy search on the metric_name based on the pattern provided.
7883
* **used**: when set, returns only the metrics that are used (`true`) or only the metrics that are not used (`false`). Leave it empty if you want both.
7984

80-
## How to use it
85+
### Invalid Metrics
86+
87+
The API endpoint `/api/v1/invalid_metrics` exposes the usage of metric names that contain a variable or a regexp.
88+
89+
```json
90+
{
91+
"node_cpu_utilization_${instance}": {
92+
"usage": {
93+
"alertRules": [
94+
{
95+
"prom_link": "https://prometheus.demo.do.prometheus.io",
96+
"group_name": "ansible managed alert rules",
97+
"name": "NodeCPUUtilizationHigh",
98+
"expression": "instance:node_cpu_utilisation:rate5m * 100 > ignoring (severity) node_cpu_utilization_percent_threshold{severity=\"critical\"}"
99+
}
100+
]
101+
}
102+
},
103+
"node_disk_discard_time_.+": {
104+
"usage": {
105+
"dashboards": [
106+
"https://demo.perses.dev/api/v1/projects/perses/dashboards/nodeexporterfull"
107+
]
108+
}
109+
}
110+
}
111+
```
112+
113+
### Pending Usage
114+
115+
The API endpoint `/api/v1/pending_usages` exposes the usages that have not yet been associated with a metric available on the endpoint `/api/v1/metrics`.
116+
117+
It is even possible that a usage is never associated, because the metric no longer exists.
118+
119+
## Different ways to deploy it
81120

82121
### Central instance
83122

database/database.go

Lines changed: 66 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,29 @@ import (
2828
type Database interface {
2929
GetMetric(name string) *v1.Metric
3030
ListMetrics() map[string]*v1.Metric
31+
ListInvalidMetrics() map[string]*v1.Metric
32+
ListPendingUsage() map[string]*v1.MetricUsage
3133
EnqueueMetricList(metrics []string)
34+
EnqueueInvalidMetricsUsage(usages map[string]*v1.MetricUsage)
3235
EnqueueUsage(usages map[string]*v1.MetricUsage)
3336
EnqueueLabels(labels map[string][]string)
3437
}
3538

3639
func New(cfg config.Database) Database {
3740
d := &db{
38-
metrics: make(map[string]*v1.Metric),
39-
usage: make(map[string]*v1.MetricUsage),
40-
usageQueue: make(chan map[string]*v1.MetricUsage, 250),
41-
labelsQueue: make(chan map[string][]string, 250),
42-
metricsQueue: make(chan []string, 10),
43-
path: cfg.Path,
41+
metrics: make(map[string]*v1.Metric),
42+
invalidMetrics: make(map[string]*v1.Metric),
43+
usage: make(map[string]*v1.MetricUsage),
44+
usageQueue: make(chan map[string]*v1.MetricUsage, 250),
45+
invalidMetricsUsageQueue: make(chan map[string]*v1.MetricUsage, 250),
46+
labelsQueue: make(chan map[string][]string, 250),
47+
metricsQueue: make(chan []string, 10),
48+
path: cfg.Path,
4449
}
4550

4651
go d.watchUsageQueue()
4752
go d.watchMetricsQueue()
53+
go d.watchInvalidMetricsUsageQueue()
4854
go d.watchLabelsQueue()
4955
if !*cfg.InMemory {
5056
if err := d.readMetricsInJSONFile(); err != nil {
@@ -60,6 +66,8 @@ type db struct {
6066
// metrics is the list of metric name (as a key) associated to their usage based on the different collector activated.
6167
// This struct is our "database".
6268
metrics map[string]*v1.Metric
69+
// invalidMetrics is the list of metric names that likely contain a variable or a regexp and as such cannot be valid metric names.
70+
invalidMetrics map[string]*v1.Metric
6371
// usage is a buffer in case the metric name has not yet been collected
6472
usage map[string]*v1.MetricUsage
6573
// metricsQueue is the channel that should be used to send and receive the list of metric names to keep in memory.
@@ -73,6 +81,10 @@ type db struct {
7381
// There will be no other way to write in it.
7482
// Doing that allows us to accept more HTTP requests to write data and to delay the actual writing.
7583
usageQueue chan map[string]*v1.MetricUsage
84+
// invalidMetricsUsageQueue is the channel used to send the usage of each invalid metric name so that it gets written in the database.
85+
// There will be no other way to write in it.
86+
// Doing that allows us to accept more HTTP requests to write data and to delay the actual writing.
87+
invalidMetricsUsageQueue chan map[string]*v1.MetricUsage
7688
// path is the path to the JSON file where metrics is flushed periodically
7789
// It is empty if the database is purely in memory.
7890
path string
@@ -83,37 +95,54 @@ type db struct {
8395
// 1. Then let's flush the data into a file periodically (or once the queue is empty (if it happens))
8496
// 2. Read the file directly when a read query is coming
8597
// Like that we have two different ways to read and write the data.
86-
mutex sync.Mutex
98+
metricsMutex sync.Mutex
99+
invalidMetricsUsageMutex sync.Mutex
87100
}
88101

89102
func (d *db) GetMetric(name string) *v1.Metric {
90-
d.mutex.Lock()
91-
defer d.mutex.Unlock()
103+
d.metricsMutex.Lock()
104+
defer d.metricsMutex.Unlock()
92105
return d.metrics[name]
93106
}
94107

95108
func (d *db) ListMetrics() map[string]*v1.Metric {
96-
d.mutex.Lock()
97-
defer d.mutex.Unlock()
109+
d.metricsMutex.Lock()
110+
defer d.metricsMutex.Unlock()
98111
return d.metrics
99112
}
100113

114+
func (d *db) ListInvalidMetrics() map[string]*v1.Metric {
115+
d.invalidMetricsUsageMutex.Lock()
116+
defer d.invalidMetricsUsageMutex.Unlock()
117+
return d.invalidMetrics
118+
}
119+
101120
func (d *db) EnqueueMetricList(metrics []string) {
102121
d.metricsQueue <- metrics
103122
}
104123

124+
func (d *db) ListPendingUsage() map[string]*v1.MetricUsage {
125+
d.metricsMutex.Lock()
126+
defer d.metricsMutex.Unlock()
127+
return d.usage
128+
}
129+
105130
func (d *db) EnqueueUsage(usages map[string]*v1.MetricUsage) {
106131
d.usageQueue <- usages
107132
}
108133

134+
func (d *db) EnqueueInvalidMetricsUsage(usages map[string]*v1.MetricUsage) {
135+
d.invalidMetricsUsageQueue <- usages
136+
}
137+
109138
func (d *db) EnqueueLabels(labels map[string][]string) {
110139
d.labelsQueue <- labels
111140
}
112141

113142
func (d *db) watchMetricsQueue() {
114-
for _metrics := range d.metricsQueue {
115-
d.mutex.Lock()
116-
for _, metricName := range _metrics {
143+
for metricsName := range d.metricsQueue {
144+
d.metricsMutex.Lock()
145+
for _, metricName := range metricsName {
117146
if _, ok := d.metrics[metricName]; !ok {
118147
// As this queue only serves the purpose of storing missing metrics, we are only looking for the one not already present in the database.
119148
d.metrics[metricName] = &v1.Metric{}
@@ -125,13 +154,29 @@ func (d *db) watchMetricsQueue() {
125154
}
126155
}
127156
}
128-
d.mutex.Unlock()
157+
d.metricsMutex.Unlock()
158+
}
159+
}
160+
161+
func (d *db) watchInvalidMetricsUsageQueue() {
162+
for data := range d.invalidMetricsUsageQueue {
163+
d.invalidMetricsUsageMutex.Lock()
164+
for metricName, usage := range data {
165+
if _, ok := d.invalidMetrics[metricName]; !ok {
166+
d.invalidMetrics[metricName] = &v1.Metric{
167+
Usage: usage,
168+
}
169+
} else {
170+
d.invalidMetrics[metricName].Usage = mergeUsage(d.invalidMetrics[metricName].Usage, usage)
171+
}
172+
}
173+
d.invalidMetricsUsageMutex.Unlock()
129174
}
130175
}
131176

132177
func (d *db) watchUsageQueue() {
133178
for data := range d.usageQueue {
134-
d.mutex.Lock()
179+
d.metricsMutex.Lock()
135180
for metricName, usage := range data {
136181
if _, ok := d.metrics[metricName]; !ok {
137182
logrus.Debugf("metric_name %q is used but it's not found by the metric collector", metricName)
@@ -148,13 +193,13 @@ func (d *db) watchUsageQueue() {
148193
d.metrics[metricName].Usage = mergeUsage(d.metrics[metricName].Usage, usage)
149194
}
150195
}
151-
d.mutex.Unlock()
196+
d.metricsMutex.Unlock()
152197
}
153198
}
154199

155200
func (d *db) watchLabelsQueue() {
156201
for data := range d.labelsQueue {
157-
d.mutex.Lock()
202+
d.metricsMutex.Lock()
158203
for metricName, labels := range data {
159204
if _, ok := d.metrics[metricName]; !ok {
160205
// In this case, we should add the metric, because it means the metric has been found from another source.
@@ -165,7 +210,7 @@ func (d *db) watchLabelsQueue() {
165210
d.metrics[metricName].Labels = utils.Merge(d.metrics[metricName].Labels, labels)
166211
}
167212
}
168-
d.mutex.Unlock()
213+
d.metricsMutex.Unlock()
169214
}
170215
}
171216

@@ -180,8 +225,8 @@ func (d *db) flush(period time.Duration) {
180225
}
181226

182227
func (d *db) writeMetricsInJSONFile() error {
183-
d.mutex.Lock()
184-
defer d.mutex.Unlock()
228+
d.metricsMutex.Lock()
229+
defer d.metricsMutex.Unlock()
185230
data, err := json.Marshal(d.metrics)
186231
if err != nil {
187232
return err

go.mod

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ go 1.23.1
44

55
require (
66
github.com/go-openapi/strfmt v0.23.0
7-
github.com/grafana/grafana-openapi-client-go v0.0.0-20241101140420-bc381928ae6e
7+
github.com/grafana/grafana-openapi-client-go v0.0.0-20241113095943-9cb2bbfeb8a3
88
github.com/labstack/echo/v4 v4.12.0
99
github.com/lithammer/fuzzysearch v1.1.8
1010
github.com/perses/common v0.26.0
1111
github.com/perses/perses v0.49.0
1212
github.com/prometheus/client_golang v1.20.5
1313
github.com/prometheus/common v0.60.1
14-
github.com/prometheus/prometheus v0.55.1
14+
github.com/prometheus/prometheus v0.300.0
1515
github.com/sirupsen/logrus v1.9.3
1616
github.com/stretchr/testify v1.9.0
1717
golang.org/x/oauth2 v0.24.0
@@ -45,8 +45,6 @@ require (
4545
github.com/go-git/go-billy/v5 v5.5.0 // indirect
4646
github.com/go-git/go-git/v5 v5.12.0 // indirect
4747
github.com/go-jose/go-jose/v4 v4.0.4 // indirect
48-
github.com/go-kit/log v0.2.1 // indirect
49-
github.com/go-logfmt/logfmt v0.6.0 // indirect
5048
github.com/go-logr/logr v1.4.2 // indirect
5149
github.com/go-logr/stdr v1.2.2 // indirect
5250
github.com/go-openapi/analysis v0.23.0 // indirect
@@ -75,7 +73,7 @@ require (
7573
github.com/jpillora/backoff v1.0.0 // indirect
7674
github.com/json-iterator/go v1.1.12 // indirect
7775
github.com/kevinburke/ssh_config v1.2.0 // indirect
78-
github.com/klauspost/compress v1.17.9 // indirect
76+
github.com/klauspost/compress v1.17.10 // indirect
7977
github.com/labstack/gommon v0.4.2 // indirect
8078
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
8179
github.com/mailru/easyjson v0.7.7 // indirect
@@ -111,13 +109,13 @@ require (
111109
github.com/zitadel/schema v1.3.0 // indirect
112110
gitlab.com/digitalxero/go-conventional-commit v1.0.7 // indirect
113111
go.mongodb.org/mongo-driver v1.14.0 // indirect
114-
go.opentelemetry.io/otel v1.29.0 // indirect
115-
go.opentelemetry.io/otel/metric v1.29.0 // indirect
116-
go.opentelemetry.io/otel/sdk v1.29.0 // indirect
117-
go.opentelemetry.io/otel/trace v1.29.0 // indirect
112+
go.opentelemetry.io/otel v1.31.0 // indirect
113+
go.opentelemetry.io/otel/metric v1.31.0 // indirect
114+
go.opentelemetry.io/otel/sdk v1.30.0 // indirect
115+
go.opentelemetry.io/otel/trace v1.31.0 // indirect
118116
go.uber.org/atomic v1.11.0 // indirect
119117
golang.org/x/crypto v0.28.0 // indirect
120-
golang.org/x/net v0.29.0 // indirect
118+
golang.org/x/net v0.30.0 // indirect
121119
golang.org/x/sync v0.8.0 // indirect
122120
golang.org/x/sys v0.26.0 // indirect
123121
golang.org/x/text v0.19.0 // indirect

0 commit comments

Comments
 (0)