From 20e5af2c49c44851bdb0cb8856ac765017a76cf0 Mon Sep 17 00:00:00 2001
From: Felix Ehrenpfort
Date: Mon, 27 Jan 2025 21:51:19 +0100
Subject: [PATCH] Expose PSI metric for CPU full

Signed-off-by: Felix Ehrenpfort
---
 metrics/prometheus.go | 7 +++++++
 metrics/testdata/prometheus_metrics | 3 +++
 metrics/testdata/prometheus_metrics_whitelist_filtered | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/metrics/prometheus.go b/metrics/prometheus.go
index cf93018fea..aa6d53ceeb 100644
--- a/metrics/prometheus.go
+++ b/metrics/prometheus.go
@@ -1754,6 +1754,13 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
 	if includedMetrics.Has(container.PressureMetrics) {
 		c.containerMetrics = append(c.containerMetrics, []containerMetric{
 			{
+				name:      "container_pressure_cpu_stalled_seconds_total",
+				help:      "Total time duration no tasks in the container could make progress due to CPU congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Full.Total), timestamp: s.Timestamp}}
+				},
+			}, {
 				name:      "container_pressure_cpu_waiting_seconds_total",
 				help:      "Total time duration tasks in the container have waited due to CPU congestion.",
 				valueType: prometheus.CounterValue,
diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics
index 354e4109e7..b0dc5c444b 100644
--- a/metrics/testdata/prometheus_metrics
+++ b/metrics/testdata/prometheus_metrics
@@ -381,6 +381,9 @@ container_perf_uncore_events_total{container_env_foo_env="prod",container_label_
 # TYPE container_perf_uncore_events_scaling_ratio gauge
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
+# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion.
+# TYPE container_pressure_cpu_stalled_seconds_total counter
+container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
 # HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
 # TYPE container_pressure_cpu_waiting_seconds_total counter
 container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered
index 7c12bd4b72..8d1999815c 100644
--- a/metrics/testdata/prometheus_metrics_whitelist_filtered
+++ b/metrics/testdata/prometheus_metrics_whitelist_filtered
@@ -381,6 +381,9 @@ container_perf_uncore_events_total{container_env_foo_env="prod",event="cas_count
 # TYPE container_perf_uncore_events_scaling_ratio gauge
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
+# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion.
+# TYPE container_pressure_cpu_stalled_seconds_total counter
+container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
 # HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
 # TYPE container_pressure_cpu_waiting_seconds_total counter
 container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000