Skip to content

Commit fbee204

Browse files
committed
Add log for process id x container relation
Signed-off-by: ghokun <[email protected]>
1 parent 2d13d4c commit fbee204

File tree

2 files changed

+6
-2
lines changed

2 files changed

+6
-2
lines changed

manifests/device-plugin.yml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ spec:
4545
capabilities:
4646
add: ["SYS_ADMIN"]
4747
containers:
48-
- image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.4.4
48+
- image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.4.5
4949
name: kuartis-virtual-gpu-device-plugin-ctr
5050
command:
5151
- /usr/bin/virtual-gpu-device-plugin

pkg/gpu/nvidia/metrics.go

+5 -1
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
7474
log.Println("Error getting containers:", err)
7575
return
7676
}
77+
log.Printf("Found %d containers", len(containers.Containers))
7778
containerMap := make(map[string]containerInfo)
7879
for _, container := range containers.GetContainers() {
7980
containerMap[container.Id] = containerInfo{
@@ -91,6 +92,7 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
9192
check(ret)
9293
processes, ret := nvml.DeviceGetMPSComputeRunningProcesses(d)
9394
check(ret)
95+
log.Printf("Found %d processes on GPU %d", len(processes), i)
9496
for _, process := range processes {
9597
containerId := getContainerId(process.Pid)
9698
container := containerMap[containerId]
@@ -139,5 +141,7 @@ func getContainerId(pid uint32) string {
139141
log.Printf("Error reading proc file %s for process: %d, error: %s", file, pid, err)
140142
}
141143
proc := string(data)
142-
return proc[strings.LastIndex(proc, "/")+1:]
144+
containerId := proc[strings.LastIndex(proc, "/")+1:]
145+
log.Printf("Found container id %s for process: %d", containerId, pid)
146+
return containerId
143147
}

0 commit comments

Comments
 (0)