Skip to content

Commit f939e6e

Browse files
committed
Add service and service monitor manifests
Signed-off-by: ghokun <[email protected]>
1 parent dd92a41 commit f939e6e

File tree

2 files changed

+43
-9
lines changed

2 files changed

+43
-9
lines changed

manifests/device-plugin.yml

Lines changed: 40 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,13 +21,13 @@ metadata:
2121
spec:
2222
selector:
2323
matchLabels:
24-
name: kuartis-virtual-gpu-device-plugin
24+
app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
2525
updateStrategy:
2626
type: RollingUpdate
2727
template:
2828
metadata:
2929
labels:
30-
name: kuartis-virtual-gpu-device-plugin
30+
app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
3131
spec:
3232
hostIPC: true
3333
nodeSelector:
@@ -45,7 +45,7 @@ spec:
4545
capabilities:
4646
add: ["SYS_ADMIN"]
4747
containers:
48-
- image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.4.10
48+
- image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.5.0
4949
name: kuartis-virtual-gpu-device-plugin-ctr
5050
command:
5151
- /usr/bin/virtual-gpu-device-plugin
@@ -89,3 +89,40 @@ spec:
8989
- name: dockershimsock
9090
hostPath:
9191
path: /var/run/dockershim.sock
92+
---
93+
apiVersion: v1
94+
kind: Service
95+
metadata:
96+
labels:
97+
app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
98+
name: kuartis-virtual-gpu-device-plugin
99+
namespace: kube-system
100+
spec:
101+
ports:
102+
- name: metrics
103+
port: 8080
104+
targetPort: 8080
105+
protocol: TCP
106+
selector:
107+
app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
108+
sessionAffinity: None
109+
type: ClusterIP
110+
---
111+
apiVersion: monitoring.coreos.com/v1
112+
kind: ServiceMonitor
113+
metadata:
114+
labels:
115+
app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
116+
name: kuartis-virtual-gpu-device-plugin
117+
namespace: kube-system
118+
spec:
119+
endpoints:
120+
- interval: 15s
121+
path: /metrics
122+
port: metrics
123+
namespaceSelector:
124+
matchNames:
125+
- kube-system
126+
selector:
127+
matchLabels:
128+
app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin

pkg/gpu/nvidia/metrics.go

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,7 @@ const (
2626

2727
var node = os.Getenv("NODE_NAME")
2828

29-
var metricsFormat = `
30-
# HELP gpu_memory_usage_per_container Shows the GPU memory usage per container.
29+
var metricsFormat = `# HELP gpu_memory_usage_per_container Shows the GPU memory usage per container.
3130
# TYPE gpu_memory_usage_per_container gauge
3231
{{- range $m := . }}
3332
gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}"} {{ $m.UsedGpuMemory }}
@@ -86,7 +85,6 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
8685
ContainerId: container.GetId(),
8786
}
8887
}
89-
log.Printf("Current map %+v", containerMap)
9088
collected := []metric{}
9189
for i := 0; i < getDeviceCount(); i++ {
9290
d, ret := nvml.DeviceGetHandleByIndex(i)
@@ -96,7 +94,7 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
9694
log.Printf("Found %d processes on GPU %d", len(processes), i)
9795
for _, process := range processes {
9896
containerId := getContainerId(process.Pid)
99-
if container, ok := containerMap[strings.TrimSpace(containerId)]; ok {
97+
if container, ok := containerMap[containerId]; ok {
10098
log.Printf("Using %s Found container %+v for process: %d", containerId, container, process.Pid)
10199
collected = append(collected, metric{
102100
Pid: process.Pid,
@@ -145,6 +143,5 @@ func getContainerId(pid uint32) string {
145143
}
146144
proc := string(data)
147145
containerId := proc[strings.LastIndex(proc, "/")+1:]
148-
log.Printf("Found container id %s for process: %d", containerId, pid)
149-
return containerId
146+
return strings.TrimSpace(containerId)
150147
}

0 commit comments

Comments
 (0)