Commit 5bd422f

Merge pull request #3649 from xinau/xinau/add-psi-metrics
Add Pressure Stall Information Metrics
2 parents 5b64902 + 20e5af2

9 files changed: +207 -2 lines changed

cmd/cadvisor_test.go

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ func TestToIncludedMetrics(t *testing.T) {
 			container.ResctrlMetrics:  struct{}{},
 			container.CPUSetMetrics:   struct{}{},
 			container.OOMMetrics:      struct{}{},
+			container.PressureMetrics: struct{}{},
 		},
 		container.AllMetrics,
 		{},

container/factory.go

Lines changed: 2 additions & 0 deletions
@@ -66,6 +66,7 @@ const (
 	ResctrlMetrics  MetricKind = "resctrl"
 	CPUSetMetrics   MetricKind = "cpuset"
 	OOMMetrics      MetricKind = "oom_event"
+	PressureMetrics MetricKind = "pressure"
 )

 // AllMetrics represents all kinds of metrics that cAdvisor supported.
@@ -91,6 +92,7 @@ var AllMetrics = MetricSet{
 	ResctrlMetrics:  struct{}{},
 	CPUSetMetrics:   struct{}{},
 	OOMMetrics:      struct{}{},
+	PressureMetrics: struct{}{},
 }

 // AllNetworkMetrics represents all network metrics that cAdvisor supports.
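
Since "pressure" is now part of AllMetrics, it can be toggled like any other optional metric group. A minimal sketch of how consuming code gates on the new kind via the existing container.MetricSet API (the flag plumbing that normally builds includedMetrics is assumed, not shown in this diff):

package main

import (
	"fmt"

	"github.com/google/cadvisor/container"
)

func main() {
	// Hypothetical metric set; in cAdvisor this is derived from the
	// metric enable/disable flags rather than built by hand.
	includedMetrics := container.MetricSet{
		container.PressureMetrics: struct{}{},
	}
	// Collectors check membership before registering PSI metrics.
	fmt.Println(includedMetrics.Has(container.PressureMetrics)) // true
}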

container/libcontainer/handler.go

Lines changed: 19 additions & 0 deletions
@@ -771,6 +771,7 @@ func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
 	ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
 	ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
 	ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
+	setPSIStats(s.CpuStats.PSI, &ret.Cpu.PSI)

 	if !withPerCPU {
 		return
@@ -792,13 +793,15 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.DiskIo.IoWaitTime = diskStatsCopy(s.BlkioStats.IoWaitTimeRecursive)
 	ret.DiskIo.IoMerged = diskStatsCopy(s.BlkioStats.IoMergedRecursive)
 	ret.DiskIo.IoTime = diskStatsCopy(s.BlkioStats.IoTimeRecursive)
+	setPSIStats(s.BlkioStats.PSI, &ret.DiskIo.PSI)
 }

 func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Memory.Usage = s.MemoryStats.Usage.Usage
 	ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
 	ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
 	ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage
+	setPSIStats(s.MemoryStats.PSI, &ret.Memory.PSI)

 	if cgroups.IsCgroup2UnifiedMode() {
 		ret.Memory.Cache = s.MemoryStats.Stats["file"]
@@ -884,6 +887,22 @@ func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	}
 }

+func setPSIData(d *cgroups.PSIData, ret *info.PSIData) {
+	if d != nil {
+		ret.Total = d.Total
+		ret.Avg10 = d.Avg10
+		ret.Avg60 = d.Avg60
+		ret.Avg300 = d.Avg300
+	}
+}
+
+func setPSIStats(s *cgroups.PSIStats, ret *info.PSIStats) {
+	if s != nil {
+		setPSIData(&s.Full, &ret.Full)
+		setPSIData(&s.Some, &ret.Some)
+	}
+}
+
 // read from pids path not cpu
 func setThreadsStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	if s != nil {
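
For reference, the cgroups.PSIStats values copied above are parsed by the runc cgroups package from the cgroup v2 pressure files (cpu.pressure, memory.pressure, io.pressure). Per the kernel PSI documentation, each file looks like the following (numbers are illustrative): the avgN fields are percentages over the trailing N-second window, and total is the cumulative stall time in microseconds.

some avg10=1.53 avg60=0.87 avg300=0.73 total=120843693
full avg10=0.00 avg60=0.00 avg300=0.00 total=35783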

container/libcontainer/handler_test.go

Lines changed: 28 additions & 0 deletions
@@ -110,6 +110,20 @@ func TestSetCPUStats(t *testing.T) {
 			UsageInKernelmode: 734746 * nanosecondsInSeconds / clockTicks,
 			UsageInUsermode:   2767637 * nanosecondsInSeconds / clockTicks,
 		},
+		PSI: &cgroups.PSIStats{
+			Full: cgroups.PSIData{
+				Avg10:  0.3,
+				Avg60:  0.2,
+				Avg300: 0.1,
+				Total:  100,
+			},
+			Some: cgroups.PSIData{
+				Avg10:  0.6,
+				Avg60:  0.4,
+				Avg300: 0.2,
+				Total:  200,
+			},
+		},
 	},
 }
 var ret info.ContainerStats
@@ -123,6 +137,20 @@ func TestSetCPUStats(t *testing.T) {
 			System: s.CpuStats.CpuUsage.UsageInKernelmode,
 			Total:  33802947350272,
 		},
+		PSI: info.PSIStats{
+			Full: info.PSIData{
+				Avg10:  0.3,
+				Avg60:  0.2,
+				Avg300: 0.1,
+				Total:  100,
+			},
+			Some: info.PSIData{
+				Avg10:  0.6,
+				Avg60:  0.4,
+				Avg300: 0.2,
+				Total:  200,
+			},
+		},
 	},
 }
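
Note that the fixture and the expected result use identical numbers: setPSIStats is a plain field-by-field copy from cgroups.PSIStats into info.PSIStats, with no unit conversion at this layer (the microseconds-to-seconds conversion happens later in metrics/prometheus.go).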

info/v1/container.go

Lines changed: 25 additions & 1 deletion
@@ -261,6 +261,26 @@ func (ci *ContainerInfo) StatsEndTime() time.Time {
 	return ret
 }

+// PSI statistics for an individual resource.
+type PSIStats struct {
+	// PSI data for all tasks in the cgroup.
+	Full PSIData `json:"full,omitempty"`
+	// PSI data for some tasks in the cgroup.
+	Some PSIData `json:"some,omitempty"`
+}
+
+type PSIData struct {
+	// Total time duration tasks in the cgroup have waited due to congestion.
+	// Unit: microseconds.
+	Total uint64 `json:"total"`
+	// The average share of time (in %) tasks have waited due to congestion over a 10 second window.
+	Avg10 float64 `json:"avg10"`
+	// The average share of time (in %) tasks have waited due to congestion over a 60 second window.
+	Avg60 float64 `json:"avg60"`
+	// The average share of time (in %) tasks have waited due to congestion over a 300 second window.
+	Avg300 float64 `json:"avg300"`
+}
+
 // This mirrors kernel internal structure.
 type LoadStats struct {
 	// Number of sleeping tasks.
@@ -334,7 +354,8 @@ type CpuStats struct {
 	// from LoadStats.NrRunning.
 	LoadAverage int32 `json:"load_average"`
 	// from LoadStats.NrUninterruptible
-	LoadDAverage int32 `json:"load_d_average"`
+	LoadDAverage int32    `json:"load_d_average"`
+	PSI          PSIStats `json:"psi"`
 }

 type PerDiskStats struct {
@@ -353,6 +374,7 @@ type DiskIoStats struct {
 	IoWaitTime []PerDiskStats `json:"io_wait_time,omitempty"`
 	IoMerged   []PerDiskStats `json:"io_merged,omitempty"`
 	IoTime     []PerDiskStats `json:"io_time,omitempty"`
+	PSI        PSIStats       `json:"psi"`
 }

 type HugetlbStats struct {
@@ -411,6 +433,8 @@ type MemoryStats struct {

 	ContainerData    MemoryStatsMemoryData `json:"container_data,omitempty"`
 	HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"`
+
+	PSI PSIStats `json:"psi"`
 }

 type CPUSetStats struct {
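
Given the JSON tags above, here is a quick standalone sketch (illustrative values) of how a populated PSIStats serializes in the v1 API:

package main

import (
	"encoding/json"
	"fmt"

	info "github.com/google/cadvisor/info/v1"
)

func main() {
	psi := info.PSIStats{
		Full: info.PSIData{Total: 100, Avg10: 0.3, Avg60: 0.2, Avg300: 0.1},
		Some: info.PSIData{Total: 200, Avg10: 0.6, Avg60: 0.4, Avg300: 0.2},
	}
	out, err := json.Marshal(psi)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
	// {"full":{"total":100,"avg10":0.3,"avg60":0.2,"avg300":0.1},
	//  "some":{"total":200,"avg10":0.6,"avg60":0.4,"avg300":0.2}}
}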

metrics/prometheus.go

Lines changed: 54 additions & 1 deletion
@@ -33,9 +33,14 @@ import (
 // asFloat64 converts a uint64 into a float64.
 func asFloat64(v uint64) float64 { return float64(v) }

+// asMicrosecondsToSeconds converts microseconds into a float64 representing seconds.
+func asMicrosecondsToSeconds(v uint64) float64 {
+	return float64(v) / 1e6
+}
+
 // asNanosecondsToSeconds converts nanoseconds into a float64 representing seconds.
 func asNanosecondsToSeconds(v uint64) float64 {
-	return float64(v) / float64(time.Second)
+	return float64(v) / 1e9
 }

 // fsValues is a helper method for assembling per-filesystem stats.
@@ -1746,6 +1751,54 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
 		})
 	}

+	if includedMetrics.Has(container.PressureMetrics) {
+		c.containerMetrics = append(c.containerMetrics, []containerMetric{
+			{
+				name:      "container_pressure_cpu_stalled_seconds_total",
+				help:      "Total time duration no tasks in the container could make progress due to CPU congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Full.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_cpu_waiting_seconds_total",
+				help:      "Total time duration tasks in the container have waited due to CPU congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Some.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_memory_stalled_seconds_total",
+				help:      "Total time duration no tasks in the container could make progress due to memory congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Full.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_memory_waiting_seconds_total",
+				help:      "Total time duration tasks in the container have waited due to memory congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Some.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_io_stalled_seconds_total",
+				help:      "Total time duration no tasks in the container could make progress due to IO congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Full.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_io_waiting_seconds_total",
+				help:      "Total time duration tasks in the container have waited due to IO congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Some.Total), timestamp: s.Timestamp}}
+				},
+			},
+		}...)
+	}
+
 	return c
 }
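Because these metrics are counters built from the kernel's monotonically increasing total fields, they are normally consumed as rates. A sketch of a PromQL query (the container name is hypothetical) that gives the fraction of wall-clock time, between 0 and 1, that a container was completely stalled on CPU over the past five minutes:

rate(container_pressure_cpu_stalled_seconds_total{name="mycontainer"}[5m])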

metrics/prometheus_fake.go

Lines changed: 42 additions & 0 deletions
@@ -328,6 +328,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
 			},
 			LoadAverage:  2,
 			LoadDAverage: 2,
+			PSI: info.PSIStats{
+				Full: info.PSIData{
+					Avg10:  0.3,
+					Avg60:  0.2,
+					Avg300: 0.1,
+					Total:  100,
+				},
+				Some: info.PSIData{
+					Avg10:  0.6,
+					Avg60:  0.4,
+					Avg300: 0.2,
+					Total:  200,
+				},
+			},
 		},
 		Memory: info.MemoryStats{
 			Usage: 8,
@@ -358,6 +372,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
 			MappedFile:  16,
 			KernelUsage: 17,
 			Swap:        8192,
+			PSI: info.PSIStats{
+				Full: info.PSIData{
+					Avg10:  0.3,
+					Avg60:  0.2,
+					Avg300: 0.1,
+					Total:  1000,
+				},
+				Some: info.PSIData{
+					Avg10:  0.6,
+					Avg60:  0.4,
+					Avg300: 0.2,
+					Total:  2000,
+				},
+			},
 		},
 		Hugetlb: map[string]info.HugetlbStats{
 			"2Mi": {
@@ -550,6 +578,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
 				"Write": 6,
 			},
 		}},
+		PSI: info.PSIStats{
+			Full: info.PSIData{
+				Avg10:  0.3,
+				Avg60:  0.2,
+				Avg300: 0.1,
+				Total:  1100,
+			},
+			Some: info.PSIData{
+				Avg10:  0.6,
+				Avg60:  0.4,
+				Avg300: 0.2,
+				Total:  2200,
+			},
+		},
 	},
 	Filesystem: []info.FsStats{
 		{

metrics/testdata/prometheus_metrics

Lines changed: 18 additions & 0 deletions
@@ -381,6 +381,24 @@ container_perf_uncore_events_total{container_env_foo_env="prod",container_label_
 # TYPE container_perf_uncore_events_scaling_ratio gauge
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
+# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion.
+# TYPE container_pressure_cpu_stalled_seconds_total counter
+container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
+# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
+# TYPE container_pressure_cpu_waiting_seconds_total counter
+container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
+# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion.
+# TYPE container_pressure_io_stalled_seconds_total counter
+container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000
+# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion.
+# TYPE container_pressure_io_waiting_seconds_total counter
+container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000
+# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion.
+# TYPE container_pressure_memory_stalled_seconds_total counter
+container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
+# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion.
+# TYPE container_pressure_memory_waiting_seconds_total counter
+container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
 # HELP container_processes Number of processes running inside the container.
 # TYPE container_processes gauge
 container_processes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
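
These golden values (here and in the whitelist-filtered variant below) follow directly from the fixtures in metrics/prometheus_fake.go via asMicrosecondsToSeconds: the CPU Full/Some totals of 100 and 200 µs become 0.0001 and 0.0002 s, the memory totals of 1000 and 2000 µs become 0.001 and 0.002 s, and the IO totals of 1100 and 2200 µs become 0.0011 and 0.0022 s.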

metrics/testdata/prometheus_metrics_whitelist_filtered

Lines changed: 18 additions & 0 deletions
@@ -381,6 +381,24 @@ container_perf_uncore_events_total{container_env_foo_env="prod",event="cas_count
 # TYPE container_perf_uncore_events_scaling_ratio gauge
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
+# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion.
+# TYPE container_pressure_cpu_stalled_seconds_total counter
+container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
+# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
+# TYPE container_pressure_cpu_waiting_seconds_total counter
+container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
+# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion.
+# TYPE container_pressure_io_stalled_seconds_total counter
+container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000
+# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion.
+# TYPE container_pressure_io_waiting_seconds_total counter
+container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000
+# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion.
+# TYPE container_pressure_memory_stalled_seconds_total counter
+container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
+# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion.
+# TYPE container_pressure_memory_waiting_seconds_total counter
+container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
 # HELP container_processes Number of processes running inside the container.
 # TYPE container_processes gauge
 container_processes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
