
Commit d876e6f

Add vgpu metrics
Signed-off-by: ghokun <[email protected]>
1 parent db96b90 commit d876e6f
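
This commit adds a per-container vGPU metrics endpoint to the device plugin: the new metrics server in pkg/gpu/nvidia/metrics.go (shown below) listens on :8080 and serves Prometheus-style text at /metrics. As a minimal sketch of consuming it, assuming the plugin pod's port has been made reachable (for example via kubectl port-forward), a scrape could look like this:

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Assumes the plugin pod's port 8080 has been forwarded to localhost:8080.
	resp, err := http.Get("http://localhost:8080/metrics")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(body)) // prints the container_per_gpu lines
}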

File tree

7 files changed: +355 -72 lines changed

go.mod
+12-10

@@ -3,19 +3,21 @@ module github.com/kuartis/kuartis-virtual-gpu-device-plugin
 go 1.17
 
 require (
-	github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20191011002627-7a750c7e4f8b
-	github.com/fsnotify/fsnotify v1.4.7
-	golang.org/x/net v0.0.0-20190812203447-cdfb69ac37fc
-	google.golang.org/grpc v1.24.0
+	github.com/NVIDIA/go-nvml v0.11.6-0
+	github.com/fsnotify/fsnotify v1.5.1
+	golang.org/x/net v0.0.0-20220225172249-27dd8689420f
+	google.golang.org/grpc v1.45.0
+	k8s.io/cri-api v0.20.13
 	k8s.io/kubernetes v1.16.0
 )
 
 require (
-	github.com/gogo/protobuf v1.3.0 // indirect
-	github.com/golang/protobuf v1.3.2 // indirect
-	golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f // indirect
-	golang.org/x/text v0.3.2 // indirect
-	google.golang.org/genproto v0.0.0-20190926190326-7ee9db18f195 // indirect
+	github.com/gogo/protobuf v1.3.2 // indirect
+	github.com/golang/protobuf v1.5.2 // indirect
+	golang.org/x/sys v0.0.0-20220318055525-2edf467146b5 // indirect
+	golang.org/x/text v0.3.7 // indirect
+	google.golang.org/genproto v0.0.0-20220317150908-0efb43f6373e // indirect
+	google.golang.org/protobuf v1.27.1 // indirect
 )
 
 replace (
@@ -29,7 +31,7 @@ replace (
 	k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.0.0-20190819145008-029dd04813af
 	k8s.io/code-generator => k8s.io/code-generator v0.0.0-20190612205613-18da4a14b22b
 	k8s.io/component-base => k8s.io/component-base v0.0.0-20190819141909-f0f7c184477d
-	k8s.io/cri-api => k8s.io/cri-api v0.0.0-20190817025403-3ae76f584e79
+	k8s.io/cri-api => k8s.io/cri-api v0.20.13
 	k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.0.0-20190819145328-4831a4ced492
 	k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.0.0-20190819142756-13daafd3604f
 	k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.0.0-20190819144832-f53437941eef
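
The dependency changes above swap the older github.com/NVIDIA/gpu-monitoring-tools NVML bindings for github.com/NVIDIA/go-nvml and pull in k8s.io/cri-api for querying containerd. Below is a minimal sketch of the go-nvml calling convention the rest of this diff builds on, where failures are reported as nvml.Return codes instead of Go errors; the Init/Shutdown lifecycle shown here is assumed to live in plugin startup code outside this diff:

package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	// go-nvml reports failures as nvml.Return codes rather than error values.
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("unable to initialize NVML: %s", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Fatalf("unable to count devices: %s", nvml.ErrorString(ret))
	}
	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			log.Fatalf("unable to get device %d: %s", i, nvml.ErrorString(ret))
		}
		uuid, ret := device.GetUUID()
		if ret != nvml.SUCCESS {
			log.Fatalf("unable to get UUID of device %d: %s", i, nvml.ErrorString(ret))
		}
		fmt.Printf("GPU %d: %s\n", i, uuid)
	}
}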

go.sum

+125-15
Large diffs are not rendered by default.

manifests/device-plugin.yml
+12-1

@@ -45,7 +45,7 @@ spec:
           capabilities:
             add: ["SYS_ADMIN"]
       containers:
-      - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.3.3
+      - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.4.0
         name: kuartis-virtual-gpu-device-plugin-ctr
         command:
         - /usr/bin/virtual-gpu-device-plugin
@@ -58,6 +58,11 @@ spec:
         volumeMounts:
         - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
+        - name: proc
+          mountPath: /host/proc
+          readOnly: true
+        - mountPath: /var/run/containerd/containerd.sock
+          name: containerdsock
       - image: nvidia/mps
         name: mps
         volumeMounts:
@@ -70,3 +75,9 @@ spec:
       - name: nvidia-mps
        hostPath:
          path: /tmp/nvidia-mps
+      - name: proc
+        hostPath:
+          path: /proc
+      - name: containerdsock
+        hostPath:
+          path: /var/run/containerd/containerd.sock
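
Both new mounts serve the metrics collector added in this commit: the read-only /host/proc mount lets it map a GPU process PID to a container ID through the process's cpuset path, and the containerd socket lets it ask the CRI which pod, namespace, and container that ID belongs to. A small illustration of that mapping follows, using a hypothetical cpuset value since the exact path layout depends on the node's cgroup setup:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical content of /host/proc/<pid>/cpuset for a containerd-managed pod.
	cpuset := "/kubepods/burstable/pod11111111-2222-3333-4444-555555555555/0123456789abcdef"

	// The collector keeps only the last path segment, which is the container ID
	// it then looks up in the CRI ListContainers response.
	fmt.Println(cpuset[strings.LastIndex(cpuset, "/")+1:])
}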

pkg/gpu/nvidia/metrics.go
+141 (new file)

package nvidia

import (
	"bytes"
	"context"
	"fmt"
	"log"
	"net"
	"net/http"
	"os"
	"strings"
	"text/template"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	pb "k8s.io/cri-api/pkg/apis/runtime/v1"
)

const (
	containerdsocket = "/var/run/containerd/containerd.sock"
	timeout          = 10 * time.Second
)

var metricsFormat = `
# HELP container_per_gpu
# TYPE container_per_gpu gauge
{{- range $m := . }}
container_per_gpu{pid="{{ $m.Pid }}",usedgpumemory="{{ $m.UsedGpuMemory }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}"} 1
{{- end -}}`

type metric struct {
	Pid           uint32
	UsedGpuMemory uint64
	GpuIndex      int
	GpuUUID       string
	Node          string
	Namespace     string
	Pod           string
	PodUid        string
	Container     string
	ContainerId   string
}

type containerInfo struct {
	Node        string
	Namespace   string
	Pod         string
	PodUid      string
	Container   string
	ContainerId string
}

// MetricServer exposes per-container vGPU usage on :8080/metrics.
func MetricServer() {
	http.HandleFunc("/metrics", collectMetrics)
	http.ListenAndServe(":8080", nil)
}

func collectMetrics(w http.ResponseWriter, r *http.Request) {
	runtimeClient, runtimeConn, err := getRuntimeClient()
	if err != nil {
		log.Println("Error getting runtime client:", err)
		return
	}
	if runtimeConn != nil {
		defer runtimeConn.Close()
	}
	containers, err := runtimeClient.ListContainers(context.Background(), &pb.ListContainersRequest{})
	if err != nil {
		log.Println("Error getting containers:", err)
		return
	}
	// Index the CRI containers by ID so GPU processes can be mapped to pods.
	containerMap := make(map[string]containerInfo)
	for _, container := range containers.GetContainers() {
		containerMap[container.Id] = containerInfo{
			Node:        "",
			Namespace:   container.Labels["io.kubernetes.pod.namespace"],
			Pod:         container.Labels["io.kubernetes.pod.name"],
			PodUid:      container.Labels["io.kubernetes.pod.uid"],
			Container:   container.Metadata.Name,
			ContainerId: container.Id,
		}
	}
	collected := []metric{}
	for i := 0; i < getDeviceCount(); i++ {
		d, ret := nvml.DeviceGetHandleByIndex(i)
		check(ret)
		processes, ret := nvml.DeviceGetMPSComputeRunningProcesses(d)
		check(ret)
		for _, process := range processes {
			containerId := getContainerId(process.Pid)
			container := containerMap[containerId]
			collected = append(collected, metric{
				Pid:           process.Pid,
				UsedGpuMemory: process.UsedGpuMemory,
				GpuIndex:      i,
				GpuUUID:       getDeviceUUID(d),
				Node:          container.Node,
				Namespace:     container.Namespace,
				Pod:           container.Pod,
				PodUid:        container.PodUid,
				Container:     container.Container,
				ContainerId:   container.ContainerId,
			})
		}
	}

	t := template.Must(template.New("metrics").Parse(metricsFormat))
	var res bytes.Buffer
	if err := t.Execute(&res, collected); err != nil {
		w.Write([]byte(fmt.Sprintf("Error generating metrics: %s", err)))
	} else {
		w.Write(res.Bytes())
	}
}

// getRuntimeClient dials the containerd CRI socket over a unix connection.
func getRuntimeClient() (pb.RuntimeServiceClient, *grpc.ClientConn, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	conn, err := grpc.DialContext(ctx, containerdsocket, grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithBlock(),
		grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
			return net.DialTimeout("unix", addr, timeout)
		}),
	)
	if err != nil {
		return nil, nil, err
	}
	return pb.NewRuntimeServiceClient(conn), conn, nil
}

// getContainerId maps a GPU process PID to its container ID via the host's cpuset path.
func getContainerId(pid uint32) string {
	file := fmt.Sprintf("/host/proc/%d/cpuset", pid)
	data, err := os.ReadFile(file)
	if err != nil {
		log.Printf("Error reading proc file %s for process: %d, error: %s", file, pid, err)
	}
	// Trim the trailing newline so the last path segment matches the CRI container ID.
	proc := strings.TrimSpace(string(data))
	return proc[strings.LastIndex(proc, "/")+1:]
}
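
Each MPS compute process reported by NVML becomes one container_per_gpu sample. A hypothetical scrape of the endpoint, with all label values invented for illustration, would look roughly like this:

# HELP container_per_gpu
# TYPE container_per_gpu gauge
container_per_gpu{pid="12345",usedgpumemory="268435456",gpuindex="0",gpuuuid="GPU-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee",node="",namespace="default",pod="gpu-workload-0",poduid="11111111-2222-3333-4444-555555555555",container="main",containerid="0123456789abcdef"} 1

Note that collectMetrics currently leaves the node label empty and exports usedgpumemory as a label rather than as the sample value, so the gauge itself is always 1 per process.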

pkg/gpu/nvidia/nvidia.go
+58-42

@@ -20,33 +20,38 @@ import (
 	"log"
 	"strings"
 
-	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
 
 	"golang.org/x/net/context"
 	pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
 )
 
-func check(err error) {
-	if err != nil {
-		log.Panicln("Fatal:", err)
+func check(ret nvml.Return) bool {
+	if ret != nvml.SUCCESS {
+		log.Printf("Error: %s", nvml.ErrorString(ret))
+		return false
+	}
+	return true
+}
+
+func checkAndPanic(ret nvml.Return) {
+	if ret != nvml.SUCCESS {
+		log.Panicf("Fatal: %s", nvml.ErrorString(ret))
 	}
 }
 
 // Instead of returning physical GPU devices, device plugin returns vGPU devices here.
 // Total number of vGPU depends on the vGPU count user specify.
 func getVGPUDevices(vGPUCount int) []*pluginapi.Device {
-	n, err := nvml.GetDeviceCount()
-	check(err)
-
 	var devs []*pluginapi.Device
-	for i := uint(0); i < n; i++ {
-		d, err := nvml.NewDevice(i)
-		check(err)
+	for i := 0; i < getDeviceCount(); i++ {
+		d, ret := nvml.DeviceGetHandleByIndex(i)
+		checkAndPanic(ret)
 
-		log.Printf("Device Memory: %d, vGPU Count: %d", uint(*d.Memory), vGPUCount)
+		log.Printf("Device Memory: %d, vGPU Count: %d", getDeviceMemory(d), vGPUCount)
 
-		for j := uint(0); j < uint(vGPUCount); j++ {
-			vGPUDeviceID := getVGPUID(d.UUID, j)
+		for j := 0; j < vGPUCount; j++ {
+			vGPUDeviceID := getVGPUID(getDeviceUUID(d), j)
 			dev := pluginapi.Device{
 				ID:     vGPUDeviceID,
 				Health: pluginapi.Healthy,
@@ -70,27 +75,35 @@ func getVGPUDevices(vGPUCount int) []*pluginapi.Device {
 	return devs
 }
 
-func getDeviceCount() uint {
-	n, err := nvml.GetDeviceCount()
-	check(err)
+func getDeviceCount() int {
+	n, ret := nvml.DeviceGetCount()
+	checkAndPanic(ret)
 	return n
 }
 
-func getPhysicalGPUDevices() []string {
-	n, err := nvml.GetDeviceCount()
-	check(err)
+func getDeviceUUID(device nvml.Device) string {
+	uuid, ret := device.GetUUID()
+	checkAndPanic(ret)
+	return uuid
+}
 
+func getDeviceMemory(device nvml.Device) uint64 {
+	mem, ret := device.GetMemoryInfo()
+	checkAndPanic(ret)
+	return mem.Total
+}
+
+func getPhysicalGPUDevices() []string {
 	var devs []string
-	for i := uint(0); i < n; i++ {
-		d, err := nvml.NewDevice(i)
-		check(err)
-		devs = append(devs, d.UUID)
+	for i := 0; i < getDeviceCount(); i++ {
+		d, ret := nvml.DeviceGetHandleByIndex(i)
+		checkAndPanic(ret)
+		devs = append(devs, getDeviceUUID(d))
 	}
-
 	return devs
 }
 
-func getVGPUID(deviceID string, vGPUIndex uint) string {
+func getVGPUID(deviceID string, vGPUIndex int) string {
 	return fmt.Sprintf("%s-%d", deviceID, vGPUIndex)
 }
 
@@ -118,11 +131,12 @@ func physicialDeviceExists(devs []string, id string) bool {
 }
 
 func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
-	eventSet := nvml.NewEventSet()
-	defer nvml.DeleteEventSet(eventSet)
+	eventSet, ret := nvml.EventSetCreate()
+	checkAndPanic(ret)
+	defer nvml.EventSetFree(eventSet)
 	var physicalDeviceIDs []string
 
-	// We don't have to loop all virtual GPUs here. Only need to check physical CPUs.
+	// We don't have to loop all virtual GPUs here. Only need to check physical GPUs.
 	for _, d := range devs {
 		physicalDeviceID := getPhysicalDeviceID(d.ID)
 		if physicialDeviceExists(physicalDeviceIDs, physicalDeviceID) {
@@ -131,17 +145,16 @@ func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
 		physicalDeviceIDs = append(physicalDeviceIDs, physicalDeviceID)
 
 		log.Printf("virtual id %s physical id %s", d.ID, physicalDeviceID)
-		err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, physicalDeviceID)
-		if err != nil && strings.HasSuffix(err.Error(), "Not Supported") {
-			log.Printf("Warning: %s is too old to support healthchecking: %s. Marking it unhealthy.", physicalDeviceID, err)
 
+		device, ret := nvml.DeviceGetHandleByUUID(physicalDeviceID)
+		checkAndPanic(ret)
+		ret = nvml.DeviceRegisterEvents(device, nvml.EventTypeXidCriticalError, eventSet)
+		if ret == nvml.ERROR_NOT_SUPPORTED {
+			log.Printf("Warning: %s is too old to support healthchecking: %s. Marking it unhealthy.", physicalDeviceID, nvml.ErrorString(ret))
 			xids <- d
 			continue
 		}
-
-		if err != nil {
-			log.Panicln("Fatal:", err)
-		}
+		checkAndPanic(ret)
 	}
 
 	for {
@@ -151,30 +164,33 @@ func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
 		default:
 		}
 
-		e, err := nvml.WaitForEvent(eventSet, 5000)
-		if err != nil && e.Etype != nvml.XidCriticalError {
+		e, ret := nvml.EventSetWait(eventSet, 5000)
+		checkAndPanic(ret)
+		if e.EventType != nvml.EventTypeXidCriticalError {
 			continue
 		}
 
 		// FIXME: formalize the full list and document it.
 		// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
 		// Application errors: the GPU should still be healthy
-		if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 {
+		if e.EventData == 31 || e.EventData == 43 || e.EventData == 45 {
 			continue
 		}
 
-		if e.UUID == nil || len(*e.UUID) == 0 {
+		uuid, ret := e.Device.GetUUID()
+		checkAndPanic(ret)
+		if len(uuid) == 0 {
 			// All devices are unhealthy
 			for _, d := range devs {
-				log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata)
+				log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.EventData)
 				xids <- d
 			}
 			continue
 		}
 
 		for _, d := range devs {
-			if d.ID == *e.UUID {
-				log.Printf("XidCriticalError: Xid=%d on GPU=%s, the device will go unhealthy.", e.Edata, d.ID)
+			if d.ID == uuid {
+				log.Printf("XidCriticalError: Xid=%d on GPU=%s, the device will go unhealthy.", e.EventData, d.ID)
 				xids <- d
 			}
 		}