Skip to content

Commit dbdc3de

Browse files
committed
Add vgpu and mps metrics
Signed-off-by: ghokun <[email protected]>
1 parent b7fe821 commit dbdc3de

File tree

3 files changed

+56
-42
lines changed

3 files changed

+56
-42
lines changed

manifests/device-plugin.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ spec:
4545
capabilities:
4646
add: ["SYS_ADMIN"]
4747
containers:
48-
- image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.6.2
48+
- image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.6.3
4949
name: kuartis-virtual-gpu-device-plugin-ctr
5050
command:
5151
- /usr/bin/virtual-gpu-device-plugin

pkg/gpu/nvidia/metrics.go

Lines changed: 41 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -29,29 +29,33 @@ var node = os.Getenv("NODE_NAME")
2929
var metricsFormat = `# HELP gpu_memory_usage_per_container Shows the GPU memory usage per container.
3030
# TYPE gpu_memory_usage_per_container gauge
3131
{{- range $m := . }}
32-
gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}"} {{ $m.UsedGpuMemory }}
32+
gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}",vgpucount="{{ $m.VGpuCount }}",mpsactivethread="{{ $m.MpsActiveThread }}"} {{ $m.UsedGpuMemory }}
3333
{{- end -}}`
3434

3535
type metric struct {
36-
Pid uint32
37-
UsedGpuMemory uint64
38-
GpuIndex int
39-
GpuUUID string
40-
Node string
41-
Namespace string
42-
Pod string
43-
PodUid string
44-
Container string
45-
ContainerId string
36+
Pid uint32
37+
UsedGpuMemory uint64
38+
GpuIndex int
39+
GpuUUID string
40+
Node string
41+
Namespace string
42+
Pod string
43+
PodUid string
44+
Container string
45+
ContainerId string
46+
VGpuCount string
47+
MpsActiveThread string
4648
}
4749

4850
type containerInfo struct {
49-
Node string
50-
Namespace string
51-
Pod string
52-
PodUid string
53-
Container string
54-
ContainerId string
51+
Node string
52+
Namespace string
53+
Pod string
54+
PodUid string
55+
Container string
56+
ContainerId string
57+
VGpuCount string
58+
MpsActiveThread string
5559
}
5660

5761
func MetricServer() {
@@ -77,12 +81,14 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
7781
containerMap := make(map[string]containerInfo)
7882
for _, container := range containers.GetContainers() {
7983
containerMap[container.GetId()] = containerInfo{
80-
Node: node,
81-
Namespace: container.GetLabels()["io.kubernetes.pod.namespace"],
82-
Pod: container.GetLabels()["io.kubernetes.pod.name"],
83-
PodUid: container.GetLabels()["io.kubernetes.pod.uid"],
84-
Container: container.GetMetadata().GetName(),
85-
ContainerId: container.GetId(),
84+
Node: node,
85+
Namespace: container.GetLabels()["io.kubernetes.pod.namespace"],
86+
Pod: container.GetLabels()["io.kubernetes.pod.name"],
87+
PodUid: container.GetLabels()["io.kubernetes.pod.uid"],
88+
Container: container.GetMetadata().GetName(),
89+
ContainerId: container.GetId(),
90+
VGpuCount: container.GetAnnotations()["k8s.kuartis.com/vgpu-count"],
91+
MpsActiveThread: container.GetAnnotations()["k8s.kuartis.com/mps-active-thread"],
8692
}
8793
}
8894
collected := []metric{}
@@ -97,16 +103,18 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
97103
if container, ok := containerMap[containerId]; ok {
98104
log.Printf("Using %s Found container %+v for process: %d", containerId, container, process.Pid)
99105
collected = append(collected, metric{
100-
Pid: process.Pid,
101-
UsedGpuMemory: process.UsedGpuMemory,
102-
GpuIndex: i,
103-
GpuUUID: getDeviceUUID(d),
104-
Node: container.Node,
105-
Namespace: container.Namespace,
106-
Pod: container.Pod,
107-
PodUid: container.PodUid,
108-
Container: container.Container,
109-
ContainerId: container.ContainerId,
106+
Pid: process.Pid,
107+
UsedGpuMemory: process.UsedGpuMemory,
108+
GpuIndex: i,
109+
GpuUUID: getDeviceUUID(d),
110+
Node: container.Node,
111+
Namespace: container.Namespace,
112+
Pod: container.Pod,
113+
PodUid: container.PodUid,
114+
Container: container.Container,
115+
ContainerId: container.ContainerId,
116+
VGpuCount: container.VGpuCount,
117+
MpsActiveThread: container.MpsActiveThread,
110118
})
111119
}
112120
}

pkg/gpu/nvidia/server.go

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -290,17 +290,23 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.Alloc
290290

291291
cresp := new(pluginapi.ContainerAllocateResponse)
292292

293+
cudaActiveThread := fmt.Sprintf("%d", 100*len(req.DevicesIDs)/(len(m.devs)/len(m.physicalDevs)*len(visibleDevs)))
294+
visibleDevsStr := strings.Join(visibleDevs, ",")
295+
allocatedDeviceIdsStr := strings.Join(req.DevicesIDs, ",")
296+
293297
cresp.Envs = map[string]string{}
294-
cresp.Envs["NVIDIA_VISIBLE_DEVICES"] = strings.Join(visibleDevs, ",")
295-
cresp.Envs["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = fmt.Sprintf("%d", 100*len(req.DevicesIDs)/len(m.devs))
298+
cresp.Envs["NVIDIA_VISIBLE_DEVICES"] = visibleDevsStr
299+
cresp.Envs["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = cudaActiveThread
296300

297301
cresp.Annotations = map[string]string{}
298-
cresp.Annotations["k8s.kuartis.com/gpu-ids"] = strings.Join(visibleDevs, ",")
299-
cresp.Annotations["k8s.kuartis.com/vgpu-ids"] = strings.Join(req.DevicesIDs, ",")
300-
301-
log.Printf("Allocated physical devices: %s", strings.Join(visibleDevs, ","))
302-
log.Printf("Allocated virtual devices: %s", strings.Join(req.DevicesIDs, ","))
303-
log.Printf("Allocated MPS ACTIVE THREAD PERCENTAGE: %s", fmt.Sprintf("%d", 100*len(req.DevicesIDs)/len(m.devs)))
302+
cresp.Annotations["k8s.kuartis.com/gpu-ids"] = visibleDevsStr
303+
cresp.Annotations["k8s.kuartis.com/vgpu-ids"] = allocatedDeviceIdsStr
304+
cresp.Annotations["k8s.kuartis.com/vgpu-count"] = fmt.Sprintf("%d", len(req.DevicesIDs))
305+
cresp.Annotations["k8s.kuartis.com/mps-active-thread"] = cudaActiveThread
306+
307+
log.Printf("Allocated physical devices: %s", visibleDevsStr)
308+
log.Printf("Allocated virtual devices: %s", allocatedDeviceIdsStr)
309+
log.Printf("Allocated MPS ACTIVE THREAD PERCENTAGE: %s", cudaActiveThread)
304310

305311
response.ContainerResponses = append(response.ContainerResponses, cresp)
306312
}

0 commit comments

Comments
 (0)