Skip to content

Commit 3bb6eee

Browse files
author
Kubernetes Submit Queue
authored
Merge pull request kubernetes#55340 from jiayingz/metrics
Automatic merge from submit-queue (batch tested with PRs 55340, 55329, 56168, 56170, 56105). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Adds device plugin registration count and allocation latency metrics. For kubernetes#53497

**What this PR does / why we need it**:

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*: Fixes #

**Special notes for your reviewer**:

**Release note**:
```release-note
```
2 parents 277d866 + 048bafd commit 3bb6eee

File tree

5 files changed

+61
-0
lines changed

5 files changed

+61
-0
lines changed

pkg/kubelet/cm/deviceplugin/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ go_library(
2323
"//pkg/kubelet/config:go_default_library",
2424
"//pkg/kubelet/container:go_default_library",
2525
"//pkg/kubelet/lifecycle:go_default_library",
26+
"//pkg/kubelet/metrics:go_default_library",
2627
"//plugin/pkg/scheduler/schedulercache:go_default_library",
2728
"//vendor/github.com/golang/glog:go_default_library",
2829
"//vendor/golang.org/x/net/context:go_default_library",

pkg/kubelet/cm/deviceplugin/manager.go

+5
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"os"
2525
"path/filepath"
2626
"sync"
27+
"time"
2728

2829
"github.com/golang/glog"
2930
"golang.org/x/net/context"
@@ -36,6 +37,7 @@ import (
3637
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
3738
"k8s.io/kubernetes/pkg/kubelet/config"
3839
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
40+
"k8s.io/kubernetes/pkg/kubelet/metrics"
3941
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
4042
)
4143

@@ -265,6 +267,7 @@ func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.P
265267
// Register registers a device plugin.
266268
func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
267269
glog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
270+
metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
268271
if r.Version != pluginapi.Version {
269272
errorString := fmt.Sprintf(errUnsuportedVersion, r.Version, pluginapi.Version)
270273
glog.Infof("Bad registration request from device plugin with resource name %q: %v", r.ResourceName, errorString)
@@ -548,6 +551,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
548551
if allocDevices == nil || len(allocDevices) <= 0 {
549552
continue
550553
}
554+
startRPCTime := time.Now()
551555
// devicePluginManager.Allocate involves RPC calls to device plugin, which
552556
// could be heavy-weight. Therefore we want to perform this operation outside
553557
// mutex lock. Note if Allocate call fails, we may leave container resources
@@ -573,6 +577,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
573577
devs := allocDevices.UnsortedList()
574578
glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
575579
resp, err := e.allocate(devs)
580+
metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
576581
if err != nil {
577582
// In case of allocation failure, we want to restore m.allocatedDevices
578583
// to the actual allocated state from m.podDevices.

pkg/kubelet/metrics/metrics.go

+21
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ const (
4444
RuntimeOperationsKey = "runtime_operations"
4545
RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds"
4646
RuntimeOperationsErrorsKey = "runtime_operations_errors"
47+
// Metrics keys of device plugin operations
48+
DevicePluginRegistrationCountKey = "device_plugin_registration_count"
49+
DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds"
4750
)
4851

4952
var (
@@ -179,6 +182,22 @@ var (
179182
},
180183
[]string{"namespace", "persistentvolumeclaim"},
181184
)
185+
DevicePluginRegistrationCount = prometheus.NewCounterVec(
186+
prometheus.CounterOpts{
187+
Subsystem: KubeletSubsystem,
188+
Name: DevicePluginRegistrationCountKey,
189+
Help: "Cumulative number of device plugin registrations. Broken down by resource name.",
190+
},
191+
[]string{"resource_name"},
192+
)
193+
DevicePluginAllocationLatency = prometheus.NewSummaryVec(
194+
prometheus.SummaryOpts{
195+
Subsystem: KubeletSubsystem,
196+
Name: DevicePluginAllocationLatencyKey,
197+
Help: "Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.",
198+
},
199+
[]string{"resource_name"},
200+
)
182201
)
183202

184203
var registerMetrics sync.Once
@@ -205,6 +224,8 @@ func Register(containerCache kubecontainer.RuntimeCache) {
205224
prometheus.MustRegister(VolumeStatsInodes)
206225
prometheus.MustRegister(VolumeStatsInodesFree)
207226
prometheus.MustRegister(VolumeStatsInodesUsed)
227+
prometheus.MustRegister(DevicePluginRegistrationCount)
228+
prometheus.MustRegister(DevicePluginAllocationLatency)
208229
})
209230
}
210231

test/e2e_node/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ go_library(
4747
"//vendor/github.com/golang/glog:go_default_library",
4848
"//vendor/github.com/onsi/ginkgo:go_default_library",
4949
"//vendor/github.com/onsi/gomega:go_default_library",
50+
"//vendor/github.com/prometheus/common/model:go_default_library",
5051
"//vendor/k8s.io/api/core/v1:go_default_library",
5152
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
5253
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",

test/e2e_node/gpu_device_plugin.go

+33
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package e2e_node
1919
import (
2020
"os/exec"
2121
"regexp"
22+
"strconv"
2223
"time"
2324

2425
"k8s.io/api/core/v1"
@@ -27,10 +28,13 @@ import (
2728
"k8s.io/apimachinery/pkg/util/uuid"
2829
"k8s.io/kubernetes/pkg/features"
2930
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
31+
kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
3032
"k8s.io/kubernetes/test/e2e/framework"
33+
"k8s.io/kubernetes/test/e2e/framework/metrics"
3134

3235
. "github.com/onsi/ginkgo"
3336
. "github.com/onsi/gomega"
37+
"github.com/prometheus/common/model"
3438
)
3539

3640
const (
@@ -121,6 +125,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
121125
Expect(devIdRestart1).To(Equal(devId1))
122126
count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
123127
Expect(devIdRestart2).To(Equal(devId2))
128+
logDevicePluginMetrics()
124129

125130
// Cleanup
126131
f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@@ -129,6 +134,34 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
129134
})
130135
})
131136

137+
func logDevicePluginMetrics() {
138+
ms, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
139+
framework.ExpectNoError(err)
140+
for msKey, samples := range ms {
141+
switch msKey {
142+
case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationLatencyKey:
143+
for _, sample := range samples {
144+
latency := sample.Value
145+
resource := string(sample.Metric["resource_name"])
146+
var quantile float64
147+
if val, ok := sample.Metric[model.QuantileLabel]; ok {
148+
var err error
149+
if quantile, err = strconv.ParseFloat(string(val), 64); err != nil {
150+
continue
151+
}
152+
framework.Logf("Metric: %v ResourceName: %v Quantile: %v Latency: %v", msKey, resource, quantile, latency)
153+
}
154+
}
155+
case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginRegistrationCountKey:
156+
for _, sample := range samples {
157+
resource := string(sample.Metric["resource_name"])
158+
count := sample.Value
159+
framework.Logf("Metric: %v ResourceName: %v Count: %v", msKey, resource, count)
160+
}
161+
}
162+
}
163+
}
164+
132165
func makeCudaPauseImage() *v1.Pod {
133166
podName := testPodNamePrefix + string(uuid.NewUUID())
134167

0 commit comments

Comments
 (0)