Commit 8b41ec5

Add pressure stall information metrics
Issues: #3052, #3083, kubernetes/enhancements#4205

This change adds metrics for pressure stall information (PSI), which indicate when some or all tasks of a cgroupv2 container have stalled waiting on a congested resource (CPU, memory, IO).

The change exposes this information by adding a `Pressure` field to the container stats, carrying the cumulative stall totals reported by each controller (`Pressure.CPU`, `Pressure.Memory`, `Pressure.Io`). The information is additionally exposed as Prometheus metrics. The metric names follow the naming used by prometheus/node-exporter, where `stalled` corresponds to the PSI `full` state and `waiting` to the `some` state:

```
container_pressure_cpu_stalled_seconds_total
container_pressure_cpu_waiting_seconds_total
container_pressure_memory_stalled_seconds_total
container_pressure_memory_waiting_seconds_total
container_pressure_io_stalled_seconds_total
container_pressure_io_waiting_seconds_total
```

Signed-off-by: Felix Ehrenpfort <[email protected]>
1 parent 6b23ac7 commit 8b41ec5
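For context, on cgroupv2 the kernel exposes PSI per controller in the files `cpu.pressure`, `memory.pressure` and `io.pressure`; each `some`/`full` line carries running averages plus a cumulative `total=` stall time in microseconds. A minimal sketch of reading those totals directly is shown below — the path and helper name are illustrative only; cAdvisor itself receives this data through libcontainer rather than parsing the files here.

```go
// Illustrative only: reading raw cgroupv2 PSI totals from a pressure file.
// The cgroup path below is a placeholder, not part of this change.
package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

// readPressureTotals returns the cumulative "some" and "full" stall time
// (microseconds) from a cgroupv2 pressure file such as cpu.pressure.
func readPressureTotals(path string) (some, full uint64, err error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, 0, err
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		// Lines look like: "some avg10=0.12 avg60=0.08 avg300=0.01 total=123456"
		fields := strings.Fields(sc.Text())
		if len(fields) < 2 {
			continue
		}
		total, convErr := strconv.ParseUint(strings.TrimPrefix(fields[len(fields)-1], "total="), 10, 64)
		if convErr != nil {
			continue
		}
		switch fields[0] {
		case "some":
			some = total
		case "full":
			full = total
		}
	}
	return some, full, sc.Err()
}

func main() {
	some, full, err := readPressureTotals("/sys/fs/cgroup/system.slice/cpu.pressure")
	if err != nil {
		fmt.Println("read error:", err)
		return
	}
	// The exported counters are in seconds; PSI totals are microseconds.
	fmt.Printf("waiting (some): %.6fs, stalled (full): %.6fs\n",
		float64(some)/1e6, float64(full)/1e6)
}
```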

File tree: 10 files changed, +175 −265 lines

cmd/cadvisor_test.go (+1 −2)

```diff
@@ -112,8 +112,7 @@ func TestToIncludedMetrics(t *testing.T) {
 			container.ResctrlMetrics: struct{}{},
 			container.CPUSetMetrics: struct{}{},
 			container.OOMMetrics: struct{}{},
-			container.PSITotalMetrics: struct{}{},
-			container.PSIAvgMetrics: struct{}{},
+			container.PressureMetrics: struct{}{},
 		},
 		container.AllMetrics,
 		{},
```

cmd/go.mod (−1)

```diff
@@ -130,4 +130,3 @@ require (
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
 
-replace github.com/opencontainers/runc => github.com/dqminh/runc v0.0.0-20220513155811-6414629ada8a
```

container/factory.go (+2 −4)

```diff
@@ -66,8 +66,7 @@ const (
 	ResctrlMetrics MetricKind = "resctrl"
 	CPUSetMetrics MetricKind = "cpuset"
 	OOMMetrics MetricKind = "oom_event"
-	PSITotalMetrics MetricKind = "psi_total"
-	PSIAvgMetrics MetricKind = "psi_avg"
+	PressureMetrics MetricKind = "pressure"
 )
 
 // AllMetrics represents all kinds of metrics that cAdvisor supported.
@@ -93,8 +92,7 @@ var AllMetrics = MetricSet{
 	ResctrlMetrics: struct{}{},
 	CPUSetMetrics: struct{}{},
 	OOMMetrics: struct{}{},
-	PSITotalMetrics: struct{}{},
-	PSIAvgMetrics: struct{}{},
+	PressureMetrics: struct{}{},
 }
 
 // AllNetworkMetrics represents all network metrics that cAdvisor supports.
```
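A small illustration of how callers can gate collection on the new metric kind through the existing `MetricSet` API; the set construction below is just an example, mirroring the map-literal style of `AllMetrics`.

```go
// Illustrative only: opting in to the new pressure stall metric kind.
package main

import (
	"fmt"

	"github.com/google/cadvisor/container"
)

func main() {
	// Build a metric set that includes the new pressure metrics.
	included := container.MetricSet{
		container.PressureMetrics: struct{}{},
	}

	if included.Has(container.PressureMetrics) {
		fmt.Println("pressure stall information will be collected")
	}
}
```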

container/libcontainer/handler.go (+15 −20)

```diff
@@ -763,20 +763,6 @@ func (h *Handler) GetProcesses() ([]int, error) {
 	return pids, nil
 }
 
-// Convert libcontainer cgroups.PSIData to info.PSIData
-func convertPSIData(from *cgroups.PSIData, to *info.PSIData) {
-	to.Avg10 = from.Avg10
-	to.Avg60 = from.Avg60
-	to.Avg300 = from.Avg300
-	to.Total = from.Total
-}
-
-// Convert libcontainer cgroups.PSIStats to info.PSIStats
-func convertPSI(from *cgroups.PSIStats, to *info.PSIStats) {
-	convertPSIData(&from.Some, &to.Some)
-	convertPSIData(&from.Full, &to.Full)
-}
-
 // Convert libcontainer stats to info.ContainerStats.
 func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
 	ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode
@@ -786,8 +772,6 @@ func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
 	ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
 	ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
 
-	convertPSI(&s.CpuStats.PSI, &ret.Cpu.PSI)
-
 	if !withPerCPU {
 		return
 	}
@@ -808,8 +792,6 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.DiskIo.IoWaitTime = diskStatsCopy(s.BlkioStats.IoWaitTimeRecursive)
 	ret.DiskIo.IoMerged = diskStatsCopy(s.BlkioStats.IoMergedRecursive)
 	ret.DiskIo.IoTime = diskStatsCopy(s.BlkioStats.IoTimeRecursive)
-
-	convertPSI(&s.BlkioStats.PSI, &ret.DiskIo.PSI)
 }
 
 func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
@@ -818,8 +800,6 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
 	ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage
 
-	convertPSI(&s.MemoryStats.PSI, &ret.Memory.PSI)
-
 	if cgroups.IsCgroup2UnifiedMode() {
 		ret.Memory.Cache = s.MemoryStats.Stats["file"]
 		ret.Memory.RSS = s.MemoryStats.Stats["anon"]
@@ -904,6 +884,18 @@ func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	}
 }
 
+func getPressureData(s *cgroups.PSIStats) (ret info.PressureData) {
+	ret.Full = s.Full.Total
+	ret.Some = s.Some.Total
+	return
+}
+
+func setPressureStats(s *cgroups.Stats, ret *info.ContainerStats) {
+	ret.Pressure.CPU = getPressureData(s.CpuStats.PSI)
+	ret.Pressure.Memory = getPressureData(s.MemoryStats.PSI)
+	ret.Pressure.Io = getPressureData(s.BlkioStats.PSI)
+}
+
 // read from pids path not cpu
 func setThreadsStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	if s != nil {
@@ -932,6 +924,9 @@ func newContainerStats(cgroupStats *cgroups.Stats, includedMetrics container.MetricSet
 		if includedMetrics.Has(container.CPUSetMetrics) {
 			setCPUSetStats(s, ret)
 		}
+		if includedMetrics.Has(container.PressureMetrics) {
+			setPressureStats(s, ret)
+		}
 	}
 	return ret
 }
```
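The `PressureData` totals captured by `setPressureStats` are the kernel's cumulative PSI `total=` counters, which are in microseconds. A minimal, hypothetical sketch of how such totals map onto the `*_seconds_total` counters named in the commit message follows — this is not the actual cAdvisor Prometheus collector, and the type and function names are illustrative.

```go
// Illustrative only: converting cumulative PSI totals (microseconds) into the
// seconds-based counters named in the commit message, with stalled = "full"
// and waiting = "some".
package main

import "fmt"

// pressureData mirrors the shape of info.PressureData added by this change:
// cumulative stall time, in microseconds, for the "full" and "some" states.
type pressureData struct {
	Full uint64
	Some uint64
}

// usToSeconds converts a PSI total from microseconds to seconds.
func usToSeconds(us uint64) float64 {
	return float64(us) / 1e6
}

func main() {
	cpu := pressureData{Full: 1500000, Some: 4200000} // example values

	fmt.Printf("container_pressure_cpu_stalled_seconds_total %g\n", usToSeconds(cpu.Full))
	fmt.Printf("container_pressure_cpu_waiting_seconds_total %g\n", usToSeconds(cpu.Some))
}
```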

container/libcontainer/handler_test.go (+62 −22)

```diff
@@ -110,20 +110,6 @@ func TestSetCPUStats(t *testing.T) {
 				UsageInKernelmode: 734746 * nanosecondsInSeconds / clockTicks,
 				UsageInUsermode: 2767637 * nanosecondsInSeconds / clockTicks,
 			},
-			PSI: cgroups.PSIStats{
-				Some: cgroups.PSIData{
-					Avg10: 0.1,
-					Avg60: 0.2,
-					Avg300: 0.3,
-					Total: 100,
-				},
-				Full: cgroups.PSIData{
-					Avg10: 0.4,
-					Avg60: 0.5,
-					Avg300: 0.6,
-					Total: 200,
-				},
-			},
 		},
 	}
 	var ret info.ContainerStats
@@ -137,21 +123,75 @@ func TestSetCPUStats(t *testing.T) {
 				System: s.CpuStats.CpuUsage.UsageInKernelmode,
 				Total: 33802947350272,
 			},
-			PSI: info.PSIStats{
-				Some: info.PSIData{
-					Avg10: 0.1,
+		},
+	}
+
+	if !ret.Eq(&expected) {
+		t.Fatalf("expected %+v == %+v", ret, expected)
+	}
+}
+
+func TestSetPressureStats(t *testing.T) {
+	s := &cgroups.Stats{
+		CpuStats: cgroups.CpuStats{
+			PSI: &cgroups.PSIStats{
+				Full: cgroups.PSIData{
+					Avg10: 0.3,
 					Avg60: 0.2,
-					Avg300: 0.3,
+					Avg300: 0.1,
 					Total: 100,
 				},
-				Full: info.PSIData{
-					Avg10: 0.4,
-					Avg60: 0.5,
-					Avg300: 0.6,
+				Some: cgroups.PSIData{
+					Avg10: 0.6,
+					Avg60: 0.4,
+					Avg300: 0.2,
 					Total: 200,
 				},
 			},
 		},
+		MemoryStats: cgroups.MemoryStats{
+			PSI: &cgroups.PSIStats{
+				Full: cgroups.PSIData{
+					Avg10: 0.3,
+					Avg60: 0.2,
+					Avg300: 0.1,
+					Total: 1000,
+				},
+				Some: cgroups.PSIData{
+					Avg10: 0.6,
+					Avg60: 0.4,
+					Avg300: 0.2,
+					Total: 2000,
+				},
+			},
+		},
+		BlkioStats: cgroups.BlkioStats{
+			PSI: &cgroups.PSIStats{
+				Full: cgroups.PSIData{
+					Avg10: 0.3,
+					Avg60: 0.2,
+					Avg300: 0.1,
+					Total: 1100,
+				},
+				Some: cgroups.PSIData{
+					Avg10: 0.6,
+					Avg60: 0.4,
+					Avg300: 0.2,
+					Total: 2200,
+				},
+			},
+		},
+	}
+
+	var ret info.ContainerStats
+	setPressureStats(s, &ret)
+
+	expected := info.ContainerStats{
+		Pressure: info.PressureStats{
+			CPU: info.PressureData{Full: 100, Some: 200},
+			Memory: info.PressureData{Full: 1000, Some: 2000},
+			Io: info.PressureData{Full: 1100, Some: 2200},
+		},
 	}
 
 	if !ret.Eq(&expected) {
```

info/v1/container.go (+11 −14)

```diff
@@ -261,16 +261,16 @@ func (ci *ContainerInfo) StatsEndTime() time.Time {
 	return ret
 }
 
-type PSIData struct {
-	Avg10 float64 `json:"avg10"`
-	Avg60 float64 `json:"avg60"`
-	Avg300 float64 `json:"avg300"`
-	Total uint64 `json:"total"`
+// Pressure stall information statistics.
+type PressureStats struct {
+	CPU PressureData `json:"cpu,omitempty"`
+	Memory PressureData `json:"memory,omitempty"`
+	Io PressureData `json:"io,omitempty"`
 }
 
-type PSIStats struct {
-	Some PSIData `json:"some,omitempty"`
-	Full PSIData `json:"full,omitempty"`
+type PressureData struct {
+	Full uint64 `json:"full"`
+	Some uint64 `json:"some"`
 }
 
 // This mirrors kernel internal structure.
@@ -347,8 +345,6 @@ type CpuStats struct {
 	LoadAverage int32 `json:"load_average"`
 	// from LoadStats.NrUninterruptible
 	LoadDAverage int32 `json:"load_d_average"`
-
-	PSI PSIStats `json:"psi,omitempty"`
 }
 
 type PerDiskStats struct {
@@ -367,8 +365,6 @@ type DiskIoStats struct {
 	IoWaitTime []PerDiskStats `json:"io_wait_time,omitempty"`
 	IoMerged []PerDiskStats `json:"io_merged,omitempty"`
 	IoTime []PerDiskStats `json:"io_time,omitempty"`
-
-	PSI PSIStats `json:"psi,omitempty"`
 }
 
 type HugetlbStats struct {
@@ -427,8 +423,6 @@ type MemoryStats struct {
 
 	ContainerData MemoryStatsMemoryData `json:"container_data,omitempty"`
 	HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"`
-
-	PSI PSIStats `json:"psi,omitempty"`
 }
 
 type CPUSetStats struct {
@@ -997,6 +991,9 @@ type ContainerStats struct {
 	CpuSet CPUSetStats `json:"cpuset,omitempty"`
 
 	OOMEvents uint64 `json:"oom_events,omitempty"`
+
+	// Pressure stall information statistics
+	Pressure PressureStats `json:"pressure,omitempty"`
 }
 
 func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {
```
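A minimal illustration of consuming the new `Pressure` field through the v1 API, assuming the standard cAdvisor v1 HTTP client; the endpoint and container name below are placeholders, not part of this change.

```go
// Illustrative only: reading the new Pressure field from v1 ContainerStats.
package main

import (
	"fmt"
	"log"

	client "github.com/google/cadvisor/client"
	info "github.com/google/cadvisor/info/v1"
)

func main() {
	c, err := client.NewClient("http://localhost:8080/")
	if err != nil {
		log.Fatal(err)
	}

	// Request the most recent stats sample for a container (placeholder name).
	cinfo, err := c.ContainerInfo("/docker/some-container", &info.ContainerInfoRequest{NumStats: 1})
	if err != nil {
		log.Fatal(err)
	}

	for _, stat := range cinfo.Stats {
		// Cumulative stall time from the cgroupv2 PSI interface (microseconds),
		// split into "full" (all tasks stalled) and "some" (at least one task stalled).
		fmt.Printf("cpu: full=%d some=%d, memory: full=%d some=%d, io: full=%d some=%d\n",
			stat.Pressure.CPU.Full, stat.Pressure.CPU.Some,
			stat.Pressure.Memory.Full, stat.Pressure.Memory.Some,
			stat.Pressure.Io.Full, stat.Pressure.Io.Some)
	}
}
```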
