diff --git a/cmd/cadvisor_test.go b/cmd/cadvisor_test.go
index 58461ae182..fc3a68966f 100644
--- a/cmd/cadvisor_test.go
+++ b/cmd/cadvisor_test.go
@@ -112,6 +112,7 @@ func TestToIncludedMetrics(t *testing.T) {
 			container.ResctrlMetrics:  struct{}{},
 			container.CPUSetMetrics:   struct{}{},
 			container.OOMMetrics:      struct{}{},
+			container.PressureMetrics: struct{}{},
 		},
 		container.AllMetrics,
 		{},
diff --git a/container/factory.go b/container/factory.go
index c48a64e163..dfe6de6437 100644
--- a/container/factory.go
+++ b/container/factory.go
@@ -66,6 +66,7 @@ const (
 	ResctrlMetrics  MetricKind = "resctrl"
 	CPUSetMetrics   MetricKind = "cpuset"
 	OOMMetrics      MetricKind = "oom_event"
+	PressureMetrics MetricKind = "pressure"
 )
 
 // AllMetrics represents all kinds of metrics that cAdvisor supported.
@@ -91,6 +92,7 @@ var AllMetrics = MetricSet{
 	ResctrlMetrics:  struct{}{},
 	CPUSetMetrics:   struct{}{},
 	OOMMetrics:      struct{}{},
+	PressureMetrics: struct{}{},
 }
 
 // AllNetworkMetrics represents all network metrics that cAdvisor supports.
diff --git a/container/libcontainer/handler.go b/container/libcontainer/handler.go
index 5bf1a4f997..ece7559613 100644
--- a/container/libcontainer/handler.go
+++ b/container/libcontainer/handler.go
@@ -771,6 +771,7 @@ func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
 	ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
 	ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
 	ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
+	setPSIStats(s.CpuStats.PSI, &ret.Cpu.PSI)
 
 	if !withPerCPU {
 		return
@@ -792,6 +793,7 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.DiskIo.IoWaitTime = diskStatsCopy(s.BlkioStats.IoWaitTimeRecursive)
 	ret.DiskIo.IoMerged = diskStatsCopy(s.BlkioStats.IoMergedRecursive)
 	ret.DiskIo.IoTime = diskStatsCopy(s.BlkioStats.IoTimeRecursive)
+	setPSIStats(s.BlkioStats.PSI, &ret.DiskIo.PSI)
 }
 
 func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
@@ -799,6 +801,7 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
 	ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
 	ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage
+	setPSIStats(s.MemoryStats.PSI, &ret.Memory.PSI)
 
 	if cgroups.IsCgroup2UnifiedMode() {
 		ret.Memory.Cache = s.MemoryStats.Stats["file"]
@@ -884,6 +887,22 @@ func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	}
 }
 
+func setPSIData(d *cgroups.PSIData, ret *info.PSIData) {
+	if d != nil {
+		ret.Total = d.Total
+		ret.Avg10 = d.Avg10
+		ret.Avg60 = d.Avg60
+		ret.Avg300 = d.Avg300
+	}
+}
+
+func setPSIStats(s *cgroups.PSIStats, ret *info.PSIStats) {
+	if s != nil {
+		setPSIData(&s.Full, &ret.Full)
+		setPSIData(&s.Some, &ret.Some)
+	}
+}
+
 // read from pids path not cpu
 func setThreadsStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	if s != nil {
diff --git a/container/libcontainer/handler_test.go b/container/libcontainer/handler_test.go
index 82da0b3e67..a74fc09831 100644
--- a/container/libcontainer/handler_test.go
+++ b/container/libcontainer/handler_test.go
@@ -110,6 +110,20 @@ func TestSetCPUStats(t *testing.T) {
 			UsageInKernelmode: 734746 * nanosecondsInSeconds / clockTicks,
 			UsageInUsermode:   2767637 * nanosecondsInSeconds / clockTicks,
 		},
+		PSI: &cgroups.PSIStats{
+			Full: cgroups.PSIData{
+				Avg10:  0.3,
+				Avg60:  0.2,
+				Avg300: 0.1,
+				Total:  100,
+			},
+			Some: cgroups.PSIData{
+				Avg10:  0.6,
+				Avg60:  0.4,
+				Avg300: 0.2,
+				Total:  200,
+			},
+		},
 	},
 }
 var ret info.ContainerStats
@@ -123,6 +137,20 @@ func TestSetCPUStats(t *testing.T) {
 			System: s.CpuStats.CpuUsage.UsageInKernelmode,
 			Total:  33802947350272,
 		},
+		PSI: info.PSIStats{
+			Full: info.PSIData{
+				Avg10:  0.3,
+				Avg60:  0.2,
+				Avg300: 0.1,
+				Total:  100,
+			},
+			Some: info.PSIData{
+				Avg10:  0.6,
+				Avg60:  0.4,
+				Avg300: 0.2,
+				Total:  200,
+			},
+		},
 	},
 }
diff --git a/info/v1/container.go b/info/v1/container.go
index ae1d9caecc..5921783165 100644
--- a/info/v1/container.go
+++ b/info/v1/container.go
@@ -261,6 +261,26 @@ func (ci *ContainerInfo) StatsEndTime() time.Time {
 	return ret
 }
 
+// PSI statistics for an individual resource.
+type PSIStats struct {
+	// PSI data for all tasks in the cgroup.
+	Full PSIData `json:"full,omitempty"`
+	// PSI data for some tasks in the cgroup.
+	Some PSIData `json:"some,omitempty"`
+}
+
+type PSIData struct {
+	// Total time duration tasks in the cgroup have waited due to congestion.
+	// Unit: microseconds.
+	Total uint64 `json:"total"`
+	// The average (in %) of time tasks have waited due to congestion over a 10 second window.
+	Avg10 float64 `json:"avg10"`
+	// The average (in %) of time tasks have waited due to congestion over a 60 second window.
+	Avg60 float64 `json:"avg60"`
+	// The average (in %) of time tasks have waited due to congestion over a 300 second window.
+	Avg300 float64 `json:"avg300"`
+}
+
 // This mirrors kernel internal structure.
 type LoadStats struct {
 	// Number of sleeping tasks.
@@ -334,7 +354,8 @@ type CpuStats struct {
 	// from LoadStats.NrRunning.
 	LoadAverage int32 `json:"load_average"`
 	// from LoadStats.NrUninterruptible
-	LoadDAverage int32 `json:"load_d_average"`
+	LoadDAverage int32    `json:"load_d_average"`
+	PSI          PSIStats `json:"psi"`
 }
 
 type PerDiskStats struct {
@@ -353,6 +374,7 @@ type DiskIoStats struct {
 	IoWaitTime []PerDiskStats `json:"io_wait_time,omitempty"`
 	IoMerged   []PerDiskStats `json:"io_merged,omitempty"`
 	IoTime     []PerDiskStats `json:"io_time,omitempty"`
+	PSI        PSIStats       `json:"psi"`
 }
 
 type HugetlbStats struct {
@@ -411,6 +433,8 @@ type MemoryStats struct {
 	ContainerData    MemoryStatsMemoryData `json:"container_data,omitempty"`
 	HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"`
+
+	PSI PSIStats `json:"psi"`
 }
 
 type CPUSetStats struct {
diff --git a/metrics/prometheus.go b/metrics/prometheus.go
index 86064819d3..aa6d53ceeb 100644
--- a/metrics/prometheus.go
+++ b/metrics/prometheus.go
@@ -33,9 +33,14 @@ import (
 // asFloat64 converts a uint64 into a float64.
 func asFloat64(v uint64) float64 { return float64(v) }
 
+// asMicrosecondsToSeconds converts microseconds into a float64 representing seconds.
+func asMicrosecondsToSeconds(v uint64) float64 {
+	return float64(v) / 1e6
+}
+
 // asNanosecondsToSeconds converts nanoseconds into a float64 representing seconds.
 func asNanosecondsToSeconds(v uint64) float64 {
-	return float64(v) / float64(time.Second)
+	return float64(v) / 1e9
 }
 
 // fsValues is a helper method for assembling per-filesystem stats.
@@ -1746,6 +1751,54 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
 		})
 	}
 
+	if includedMetrics.Has(container.PressureMetrics) {
+		c.containerMetrics = append(c.containerMetrics, []containerMetric{
+			{
+				name:      "container_pressure_cpu_stalled_seconds_total",
+				help:      "Total time duration no tasks in the container could make progress due to CPU congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Full.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_cpu_waiting_seconds_total",
+				help:      "Total time duration tasks in the container have waited due to CPU congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Some.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_memory_stalled_seconds_total",
+				help:      "Total time duration no tasks in the container could make progress due to memory congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Full.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_memory_waiting_seconds_total",
+				help:      "Total time duration tasks in the container have waited due to memory congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Some.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_io_stalled_seconds_total",
+				help:      "Total time duration no tasks in the container could make progress due to IO congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Full.Total), timestamp: s.Timestamp}}
+				},
+			}, {
+				name:      "container_pressure_io_waiting_seconds_total",
+				help:      "Total time duration tasks in the container have waited due to IO congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Some.Total), timestamp: s.Timestamp}}
+				},
+			},
+		}...)
+	}
+
 	return c
 }
diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go
index fd43b78148..5e53a8d6de 100644
--- a/metrics/prometheus_fake.go
+++ b/metrics/prometheus_fake.go
@@ -328,6 +328,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
 			},
 			LoadAverage:  2,
 			LoadDAverage: 2,
+			PSI: info.PSIStats{
+				Full: info.PSIData{
+					Avg10:  0.3,
+					Avg60:  0.2,
+					Avg300: 0.1,
+					Total:  100,
+				},
+				Some: info.PSIData{
+					Avg10:  0.6,
+					Avg60:  0.4,
+					Avg300: 0.2,
+					Total:  200,
+				},
+			},
 		},
 		Memory: info.MemoryStats{
 			Usage: 8,
@@ -358,6 +372,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
 			MappedFile:  16,
 			KernelUsage: 17,
 			Swap:        8192,
+			PSI: info.PSIStats{
+				Full: info.PSIData{
+					Avg10:  0.3,
+					Avg60:  0.2,
+					Avg300: 0.1,
+					Total:  1000,
+				},
+				Some: info.PSIData{
+					Avg10:  0.6,
+					Avg60:  0.4,
+					Avg300: 0.2,
+					Total:  2000,
+				},
+			},
 		},
 		Hugetlb: map[string]info.HugetlbStats{
 			"2Mi": {
@@ -550,6 +578,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
 					"Write": 6,
 				},
 			}},
+			PSI: info.PSIStats{
+				Full: info.PSIData{
+					Avg10:  0.3,
+					Avg60:  0.2,
+					Avg300: 0.1,
+					Total:  1100,
+				},
+				Some: info.PSIData{
+					Avg10:  0.6,
+					Avg60:  0.4,
+					Avg300: 0.2,
+					Total:  2200,
+				},
+			},
 		},
 		Filesystem: []info.FsStats{
 			{
diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics
index a385e50689..b0dc5c444b 100644
--- a/metrics/testdata/prometheus_metrics
+++ b/metrics/testdata/prometheus_metrics
@@ -381,6 +381,24 @@ container_perf_uncore_events_total{container_env_foo_env="prod",container_label_
 # TYPE container_perf_uncore_events_scaling_ratio gauge
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
+# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion.
+# TYPE container_pressure_cpu_stalled_seconds_total counter
+container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
+# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
+# TYPE container_pressure_cpu_waiting_seconds_total counter
+container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
+# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion.
+# TYPE container_pressure_io_stalled_seconds_total counter
+container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000
+# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion.
+# TYPE container_pressure_io_waiting_seconds_total counter
+container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000
+# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion.
+# TYPE container_pressure_memory_stalled_seconds_total counter
+container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
+# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion.
+# TYPE container_pressure_memory_waiting_seconds_total counter
+container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
 # HELP container_processes Number of processes running inside the container.
 # TYPE container_processes gauge
 container_processes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered
index 921b2e1106..8d1999815c 100644
--- a/metrics/testdata/prometheus_metrics_whitelist_filtered
+++ b/metrics/testdata/prometheus_metrics_whitelist_filtered
@@ -381,6 +381,24 @@ container_perf_uncore_events_total{container_env_foo_env="prod",event="cas_count
 # TYPE container_perf_uncore_events_scaling_ratio gauge
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
+# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion.
+# TYPE container_pressure_cpu_stalled_seconds_total counter
+container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
+# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
+# TYPE container_pressure_cpu_waiting_seconds_total counter
+container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
+# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion.
+# TYPE container_pressure_io_stalled_seconds_total counter
+container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000
+# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion.
+# TYPE container_pressure_io_waiting_seconds_total counter
+container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000
+# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion.
+# TYPE container_pressure_memory_stalled_seconds_total counter
+container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
+# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion.
+# TYPE container_pressure_memory_waiting_seconds_total counter
+container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
 # HELP container_processes Number of processes running inside the container.
 # TYPE container_processes gauge
 container_processes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
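For context, the values feeding the new PSI fields come from the cgroup v2 pressure files (cpu.pressure, memory.pressure, io.pressure), which cAdvisor obtains through runc's cgroups package. The sketch below is illustrative only and not part of the change: the parser, file path, and sample contents are assumptions chosen to mirror the test fixtures above. It shows the pressure-file format and the microseconds-to-seconds conversion the new counters rely on.

// psi_example.go — illustrative sketch; cAdvisor itself does not ship this parser.
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// psiData mirrors the shape of info.PSIData: averages in percent, total in microseconds.
type psiData struct {
	Avg10, Avg60, Avg300 float64
	Total                uint64
}

// parsePressureLine parses one line of a pressure file, e.g.
// "some avg10=0.60 avg60=0.40 avg300=0.20 total=200".
func parsePressureLine(line string) (kind string, d psiData, err error) {
	fields := strings.Fields(line)
	if len(fields) != 5 {
		return "", d, fmt.Errorf("unexpected pressure line: %q", line)
	}
	kind = fields[0] // "some" or "full"
	for _, f := range fields[1:] {
		k, v, _ := strings.Cut(f, "=")
		switch k {
		case "avg10":
			d.Avg10, err = strconv.ParseFloat(v, 64)
		case "avg60":
			d.Avg60, err = strconv.ParseFloat(v, 64)
		case "avg300":
			d.Avg300, err = strconv.ParseFloat(v, 64)
		case "total":
			d.Total, err = strconv.ParseUint(v, 10, 64)
		}
		if err != nil {
			return "", d, err
		}
	}
	return kind, d, nil
}

func main() {
	// Sample contents of a cpu.pressure file (values match the test fixtures above).
	sample := "some avg10=0.60 avg60=0.40 avg300=0.20 total=200\nfull avg10=0.30 avg60=0.20 avg300=0.10 total=100"
	for _, line := range strings.Split(sample, "\n") {
		kind, d, err := parsePressureLine(line)
		if err != nil {
			panic(err)
		}
		// The kernel reports total= in microseconds; the new Prometheus counters divide
		// by 1e6 to expose seconds, as asMicrosecondsToSeconds does in the diff above.
		fmt.Printf("%s: stalled %.4fs (avg10=%.2f%%)\n", kind, float64(d.Total)/1e6, d.Avg10)
	}
}

Because the counters are exposed in seconds, a PromQL rate such as rate(container_pressure_cpu_waiting_seconds_total[1m]) yields the fraction of wall-clock time tasks spent waiting, the server-side analogue of the avg10/avg60/avg300 columns.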