Skip to content

Commit 6b23ac7

Browse files
dqminhFelix Ehrenpfort
authored and
Felix Ehrenpfort
committed
Expose PSI metrics with prometheus
This adds support for reading PSI metrics via prometheus. We expose the following for `psi_total`: ``` container_cpu_psi_total_seconds container_memory_psi_total_seconds container_io_psi_total_seconds ``` And for `psi_avg`: ``` container_cpu_psi_avg10_ratio container_cpu_psi_avg60_ratio container_cpu_psi_avg300_ratio container_memory_psi_avg10_ratio container_memory_psi_avg60_ratio container_memory_psi_avg300_ratio container_io_psi_avg10_ratio container_io_psi_avg60_ratio container_io_psi_avg300_ratio ``` Signed-off-by: Daniel Dao <[email protected]>
1 parent d3fefb9 commit 6b23ac7

File tree

4 files changed

+216
-0
lines changed

4 files changed

+216
-0
lines changed

Diff for: metrics/prometheus.go

+78
Original file line numberDiff line numberDiff line change
@@ -1746,6 +1746,64 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
17461746
})
17471747
}
17481748

1749+
if includedMetrics.Has(container.PSITotalMetrics) {
1750+
c.containerMetrics = append(c.containerMetrics, []containerMetric{
1751+
{
1752+
name: "container_cpu_psi_total_seconds",
1753+
help: "Total time spent under cpu pressure in seconds.",
1754+
valueType: prometheus.CounterValue,
1755+
extraLabels: []string{"kind"},
1756+
getValues: func(s *info.ContainerStats) metricValues {
1757+
return getPSIValues(s, &s.Cpu.PSI, "total")
1758+
},
1759+
}, {
1760+
name: "container_memory_psi_total_seconds",
1761+
help: "Total container time spent under memory pressure in seconds.",
1762+
valueType: prometheus.CounterValue,
1763+
extraLabels: []string{"kind"},
1764+
getValues: func(s *info.ContainerStats) metricValues {
1765+
return getPSIValues(s, &s.Memory.PSI, "total")
1766+
},
1767+
}, {
1768+
name: "container_io_psi_total_seconds",
1769+
help: "Total time spent under io pressure in seconds.",
1770+
valueType: prometheus.CounterValue,
1771+
extraLabels: []string{"kind"},
1772+
getValues: func(s *info.ContainerStats) metricValues {
1773+
return getPSIValues(s, &s.DiskIo.PSI, "total")
1774+
},
1775+
},
1776+
}...)
1777+
}
1778+
1779+
if includedMetrics.Has(container.PSIAvgMetrics) {
1780+
makePSIAvgMetric := func(controller, window string) containerMetric {
1781+
return containerMetric{
1782+
name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window),
1783+
help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window),
1784+
valueType: prometheus.GaugeValue,
1785+
extraLabels: []string{"kind"},
1786+
getValues: func(s *info.ContainerStats) metricValues {
1787+
switch controller {
1788+
case "cpu":
1789+
return getPSIValues(s, &s.Cpu.PSI, "avg"+window)
1790+
case "memory":
1791+
return getPSIValues(s, &s.Memory.PSI, "avg"+window)
1792+
case "io":
1793+
return getPSIValues(s, &s.DiskIo.PSI, "avg"+window)
1794+
default:
1795+
return nil
1796+
}
1797+
},
1798+
}
1799+
}
1800+
for _, controller := range []string{"cpu", "memory", "io"} {
1801+
for _, window := range []string{"10", "60", "300"} {
1802+
c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window))
1803+
}
1804+
}
1805+
}
1806+
17491807
return c
17501808
}
17511809

@@ -2038,3 +2096,23 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues {
20382096
}
20392097
return values
20402098
}
2099+
2100+
func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues {
2101+
v := make(metricValues, 0, 2)
2102+
switch psiMetric {
2103+
case "avg10":
2104+
v = append(v, metricValue{value: psi.Some.Avg10, timestamp: s.Timestamp, labels: []string{"some"}})
2105+
v = append(v, metricValue{value: psi.Full.Avg10, timestamp: s.Timestamp, labels: []string{"full"}})
2106+
case "avg60":
2107+
v = append(v, metricValue{value: psi.Some.Avg60, timestamp: s.Timestamp, labels: []string{"some"}})
2108+
v = append(v, metricValue{value: psi.Full.Avg60, timestamp: s.Timestamp, labels: []string{"full"}})
2109+
case "avg300":
2110+
v = append(v, metricValue{value: psi.Some.Avg300, timestamp: s.Timestamp, labels: []string{"some"}})
2111+
v = append(v, metricValue{value: psi.Full.Avg300, timestamp: s.Timestamp, labels: []string{"full"}})
2112+
case "total":
2113+
// total is measured as microseconds
2114+
v = append(v, metricValue{value: float64(time.Duration(psi.Some.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"some"}})
2115+
v = append(v, metricValue{value: float64(time.Duration(psi.Full.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"full"}})
2116+
}
2117+
return v
2118+
}

Diff for: metrics/prometheus_fake.go

+42
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
328328
},
329329
LoadAverage: 2,
330330
LoadDAverage: 2,
331+
PSI: info.PSIStats{
332+
Some: info.PSIData{
333+
Avg10: 0.1,
334+
Avg60: 0.2,
335+
Avg300: 0.3,
336+
Total: 100,
337+
},
338+
Full: info.PSIData{
339+
Avg10: 0.4,
340+
Avg60: 0.5,
341+
Avg300: 0.6,
342+
Total: 200,
343+
},
344+
},
331345
},
332346
Memory: info.MemoryStats{
333347
Usage: 8,
@@ -358,6 +372,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
358372
MappedFile: 16,
359373
KernelUsage: 17,
360374
Swap: 8192,
375+
PSI: info.PSIStats{
376+
Some: info.PSIData{
377+
Avg10: 0.01,
378+
Avg60: 0.02,
379+
Avg300: 0.03,
380+
Total: 1000,
381+
},
382+
Full: info.PSIData{
383+
Avg10: 0.04,
384+
Avg60: 0.05,
385+
Avg300: 0.06,
386+
Total: 2000,
387+
},
388+
},
361389
},
362390
Hugetlb: map[string]info.HugetlbStats{
363391
"2Mi": {
@@ -550,6 +578,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
550578
"Write": 6,
551579
},
552580
}},
581+
PSI: info.PSIStats{
582+
Some: info.PSIData{
583+
Avg10: 0.11,
584+
Avg60: 0.12,
585+
Avg300: 0.13,
586+
Total: 1111,
587+
},
588+
Full: info.PSIData{
589+
Avg10: 0.14,
590+
Avg60: 0.15,
591+
Avg300: 0.16,
592+
Total: 2222,
593+
},
594+
},
553595
},
554596
Filesystem: []info.FsStats{
555597
{

Diff for: metrics/testdata/prometheus_metrics

+48
Original file line numberDiff line numberDiff line change
@@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",container_label_fo
433433
# TYPE container_memory_bandwidth_local_bytes gauge
434434
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
435435
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
436+
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
437+
# TYPE container_cpu_psi_avg10_ratio gauge
438+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
439+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
440+
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
441+
# TYPE container_cpu_psi_avg300_ratio gauge
442+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
443+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
444+
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
445+
# TYPE container_cpu_psi_avg60_ratio gauge
446+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
447+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
448+
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
449+
# TYPE container_cpu_psi_total_seconds counter
450+
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
451+
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
452+
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
453+
# TYPE container_io_psi_avg10_ratio gauge
454+
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
455+
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
456+
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
457+
# TYPE container_io_psi_avg300_ratio gauge
458+
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
459+
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
460+
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
461+
# TYPE container_io_psi_avg60_ratio gauge
462+
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
463+
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
464+
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
465+
# TYPE container_io_psi_total_seconds counter
466+
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000
467+
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000
468+
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
469+
# TYPE container_memory_psi_avg10_ratio gauge
470+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
471+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
472+
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
473+
# TYPE container_memory_psi_avg300_ratio gauge
474+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
475+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
476+
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
477+
# TYPE container_memory_psi_avg60_ratio gauge
478+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
479+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
480+
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
481+
# TYPE container_memory_psi_total_seconds counter
482+
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
483+
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000

Diff for: metrics/testdata/prometheus_metrics_whitelist_filtered

+48
Original file line numberDiff line numberDiff line change
@@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",id="testcontainer"
433433
# TYPE container_memory_bandwidth_local_bytes gauge
434434
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
435435
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
436+
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
437+
# TYPE container_cpu_psi_avg10_ratio gauge
438+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
439+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
440+
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
441+
# TYPE container_cpu_psi_avg300_ratio gauge
442+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
443+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
444+
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
445+
# TYPE container_cpu_psi_avg60_ratio gauge
446+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
447+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
448+
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
449+
# TYPE container_cpu_psi_total_seconds counter
450+
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
451+
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
452+
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
453+
# TYPE container_io_psi_avg10_ratio gauge
454+
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
455+
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
456+
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
457+
# TYPE container_io_psi_avg300_ratio gauge
458+
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
459+
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
460+
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
461+
# TYPE container_io_psi_avg60_ratio gauge
462+
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
463+
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
464+
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
465+
# TYPE container_io_psi_total_seconds counter
466+
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000
467+
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000
468+
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
469+
# TYPE container_memory_psi_avg10_ratio gauge
470+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
471+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
472+
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
473+
# TYPE container_memory_psi_avg300_ratio gauge
474+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
475+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
476+
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
477+
# TYPE container_memory_psi_avg60_ratio gauge
478+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
479+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
480+
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
481+
# TYPE container_memory_psi_total_seconds counter
482+
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
483+
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000

0 commit comments

Comments
 (0)