
Commit d876e6f

Add vgpu metrics
Signed-off-by: ghokun <[email protected]>
1 parent db96b90 commit d876e6f
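
This commit adds a per-container vGPU metrics endpoint to the device plugin: the new metrics server in pkg/gpu/nvidia/metrics.go (shown below) listens on :8080 and serves Prometheus-style text at /metrics. As a minimal sketch of consuming it, assuming the plugin pod's port has been made reachable (for example via kubectl port-forward), a scrape could look like this:

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Assumes the plugin pod's port 8080 has been forwarded to localhost:8080.
	resp, err := http.Get("http://localhost:8080/metrics")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(body)) // prints the container_per_gpu lines
}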

File tree

7 files changed: +355 -72 lines changed

go.mod
+12-10

@@ -3,19 +3,21 @@ module github.com/kuartis/kuartis-virtual-gpu-device-plugin
 go 1.17
 
 require (
-	github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20191011002627-7a750c7e4f8b
-	github.com/fsnotify/fsnotify v1.4.7
-	golang.org/x/net v0.0.0-20190812203447-cdfb69ac37fc
-	google.golang.org/grpc v1.24.0
+	github.com/NVIDIA/go-nvml v0.11.6-0
+	github.com/fsnotify/fsnotify v1.5.1
+	golang.org/x/net v0.0.0-20220225172249-27dd8689420f
+	google.golang.org/grpc v1.45.0
+	k8s.io/cri-api v0.20.13
 	k8s.io/kubernetes v1.16.0
 )
 
 require (
-	github.com/gogo/protobuf v1.3.0 // indirect
-	github.com/golang/protobuf v1.3.2 // indirect
-	golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f // indirect
-	golang.org/x/text v0.3.2 // indirect
-	google.golang.org/genproto v0.0.0-20190926190326-7ee9db18f195 // indirect
+	github.com/gogo/protobuf v1.3.2 // indirect
+	github.com/golang/protobuf v1.5.2 // indirect
+	golang.org/x/sys v0.0.0-20220318055525-2edf467146b5 // indirect
+	golang.org/x/text v0.3.7 // indirect
+	google.golang.org/genproto v0.0.0-20220317150908-0efb43f6373e // indirect
+	google.golang.org/protobuf v1.27.1 // indirect
 )
 
 replace (
@@ -29,7 +31,7 @@ replace (
 	k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.0.0-20190819145008-029dd04813af
 	k8s.io/code-generator => k8s.io/code-generator v0.0.0-20190612205613-18da4a14b22b
 	k8s.io/component-base => k8s.io/component-base v0.0.0-20190819141909-f0f7c184477d
-	k8s.io/cri-api => k8s.io/cri-api v0.0.0-20190817025403-3ae76f584e79
+	k8s.io/cri-api => k8s.io/cri-api v0.20.13
 	k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.0.0-20190819145328-4831a4ced492
 	k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.0.0-20190819142756-13daafd3604f
 	k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.0.0-20190819144832-f53437941eef
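
The dependency changes above swap the older github.com/NVIDIA/gpu-monitoring-tools NVML bindings for github.com/NVIDIA/go-nvml and pull in k8s.io/cri-api for querying containerd. Below is a minimal sketch of the go-nvml calling convention the rest of this diff builds on, where failures are reported as nvml.Return codes instead of Go errors; the Init/Shutdown lifecycle shown here is assumed to live in plugin startup code outside this diff:

package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	// go-nvml reports failures as nvml.Return codes rather than error values.
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("unable to initialize NVML: %s", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Fatalf("unable to count devices: %s", nvml.ErrorString(ret))
	}
	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			log.Fatalf("unable to get device %d: %s", i, nvml.ErrorString(ret))
		}
		uuid, ret := device.GetUUID()
		if ret != nvml.SUCCESS {
			log.Fatalf("unable to get UUID of device %d: %s", i, nvml.ErrorString(ret))
		}
		fmt.Printf("GPU %d: %s\n", i, uuid)
	}
}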

go.sum

+125-15
Large diffs are not rendered by default.

manifests/device-plugin.yml
+12-1

@@ -45,7 +45,7 @@ spec:
           capabilities:
             add: ["SYS_ADMIN"]
       containers:
-      - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.3.3
+      - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.4.0
         name: kuartis-virtual-gpu-device-plugin-ctr
         command:
         - /usr/bin/virtual-gpu-device-plugin
@@ -58,6 +58,11 @@ spec:
         volumeMounts:
         - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
+        - name: proc
+          mountPath: /host/proc
+          readOnly: true
+        - mountPath: /var/run/containerd/containerd.sock
+          name: containerdsock
       - image: nvidia/mps
         name: mps
         volumeMounts:
@@ -70,3 +75,9 @@ spec:
       - name: nvidia-mps
        hostPath:
          path: /tmp/nvidia-mps
+      - name: proc
+        hostPath:
+          path: /proc
+      - name: containerdsock
+        hostPath:
+          path: /var/run/containerd/containerd.sock
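
Both new mounts serve the metrics collector added in this commit: the read-only /host/proc mount lets it map a GPU process PID to a container ID through the process's cpuset path, and the containerd socket lets it ask the CRI which pod, namespace, and container that ID belongs to. A small illustration of that mapping follows, using a hypothetical cpuset value since the exact path layout depends on the node's cgroup setup:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical content of /host/proc/<pid>/cpuset for a containerd-managed pod.
	cpuset := "/kubepods/burstable/pod11111111-2222-3333-4444-555555555555/0123456789abcdef"

	// The collector keeps only the last path segment, which is the container ID
	// it then looks up in the CRI ListContainers response.
	fmt.Println(cpuset[strings.LastIndex(cpuset, "/")+1:])
}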

pkg/gpu/nvidia/metrics.go
+141 (new file)

package nvidia

import (
	"bytes"
	"context"
	"fmt"
	"log"
	"net"
	"net/http"
	"os"
	"strings"
	"text/template"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	pb "k8s.io/cri-api/pkg/apis/runtime/v1"
)

const (
	containerdsocket = "/var/run/containerd/containerd.sock"
	timeout          = 10 * time.Second
)

var metricsFormat = `
# HELP container_per_gpu
# TYPE container_per_gpu gauge
{{- range $m := . }}
container_per_gpu{pid="{{ $m.Pid }}",usedgpumemory="{{ $m.UsedGpuMemory }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}"} 1
{{- end -}}`

type metric struct {
	Pid           uint32
	UsedGpuMemory uint64
	GpuIndex      int
	GpuUUID       string
	Node          string
	Namespace     string
	Pod           string
	PodUid        string
	Container     string
	ContainerId   string
}

type containerInfo struct {
	Node        string
	Namespace   string
	Pod         string
	PodUid      string
	Container   string
	ContainerId string
}

// MetricServer exposes per-container vGPU usage on :8080/metrics.
func MetricServer() {
	http.HandleFunc("/metrics", collectMetrics)
	http.ListenAndServe(":8080", nil)
}

func collectMetrics(w http.ResponseWriter, r *http.Request) {
	runtimeClient, runtimeConn, err := getRuntimeClient()
	if err != nil {
		log.Println("Error getting runtime client:", err)
		return
	}
	if runtimeConn != nil {
		defer runtimeConn.Close()
	}
	containers, err := runtimeClient.ListContainers(context.Background(), &pb.ListContainersRequest{})
	if err != nil {
		log.Println("Error getting containers:", err)
		return
	}
	// Index the CRI containers by ID so GPU processes can be mapped to pods.
	containerMap := make(map[string]containerInfo)
	for _, container := range containers.GetContainers() {
		containerMap[container.Id] = containerInfo{
			Node:        "",
			Namespace:   container.Labels["io.kubernetes.pod.namespace"],
			Pod:         container.Labels["io.kubernetes.pod.name"],
			PodUid:      container.Labels["io.kubernetes.pod.uid"],
			Container:   container.Metadata.Name,
			ContainerId: container.Id,
		}
	}
	collected := []metric{}
	for i := 0; i < getDeviceCount(); i++ {
		d, ret := nvml.DeviceGetHandleByIndex(i)
		check(ret)
		processes, ret := nvml.DeviceGetMPSComputeRunningProcesses(d)
		check(ret)
		for _, process := range processes {
			containerId := getContainerId(process.Pid)
			container := containerMap[containerId]
			collected = append(collected, metric{
				Pid:           process.Pid,
				UsedGpuMemory: process.UsedGpuMemory,
				GpuIndex:      i,
				GpuUUID:       getDeviceUUID(d),
				Node:          container.Node,
				Namespace:     container.Namespace,
				Pod:           container.Pod,
				PodUid:        container.PodUid,
				Container:     container.Container,
				ContainerId:   container.ContainerId,
			})
		}
	}

	t := template.Must(template.New("metrics").Parse(metricsFormat))
	var res bytes.Buffer
	if err := t.Execute(&res, collected); err != nil {
		w.Write([]byte(fmt.Sprintf("Error generating metrics: %s", err)))
	} else {
		w.Write(res.Bytes())
	}
}

// getRuntimeClient dials the containerd CRI socket over a unix connection.
func getRuntimeClient() (pb.RuntimeServiceClient, *grpc.ClientConn, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	conn, err := grpc.DialContext(ctx, containerdsocket, grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithBlock(),
		grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
			return net.DialTimeout("unix", addr, timeout)
		}),
	)
	if err != nil {
		return nil, nil, err
	}
	return pb.NewRuntimeServiceClient(conn), conn, nil
}

// getContainerId maps a GPU process PID to its container ID via the host's cpuset path.
func getContainerId(pid uint32) string {
	file := fmt.Sprintf("/host/proc/%d/cpuset", pid)
	data, err := os.ReadFile(file)
	if err != nil {
		log.Printf("Error reading proc file %s for process: %d, error: %s", file, pid, err)
	}
	// Trim the trailing newline so the last path segment matches the CRI container ID.
	proc := strings.TrimSpace(string(data))
	return proc[strings.LastIndex(proc, "/")+1:]
}
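
Each MPS compute process reported by NVML becomes one container_per_gpu sample. A hypothetical scrape of the endpoint, with all label values invented for illustration, would look roughly like this:

# HELP container_per_gpu
# TYPE container_per_gpu gauge
container_per_gpu{pid="12345",usedgpumemory="268435456",gpuindex="0",gpuuuid="GPU-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee",node="",namespace="default",pod="gpu-workload-0",poduid="11111111-2222-3333-4444-555555555555",container="main",containerid="0123456789abcdef"} 1

Note that collectMetrics currently leaves the node label empty and exports usedgpumemory as a label rather than as the sample value, so the gauge itself is always 1 per process.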

pkg/gpu/nvidia/nvidia.go
+58-42

@@ -20,33 +20,38 @@ import (
 	"log"
 	"strings"
 
-	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
 
 	"golang.org/x/net/context"
 	pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
 )
 
-func check(err error) {
-	if err != nil {
-		log.Panicln("Fatal:", err)
+func check(ret nvml.Return) bool {
+	if ret != nvml.SUCCESS {
+		log.Printf("Error: %s", nvml.ErrorString(ret))
+		return false
+	}
+	return true
+}
+
+func checkAndPanic(ret nvml.Return) {
+	if ret != nvml.SUCCESS {
+		log.Panicf("Fatal: %s", nvml.ErrorString(ret))
 	}
 }
 
 // Instead of returning physical GPU devices, device plugin returns vGPU devices here.
 // Total number of vGPU depends on the vGPU count user specify.
 func getVGPUDevices(vGPUCount int) []*pluginapi.Device {
-	n, err := nvml.GetDeviceCount()
-	check(err)
-
 	var devs []*pluginapi.Device
-	for i := uint(0); i < n; i++ {
-		d, err := nvml.NewDevice(i)
-		check(err)
+	for i := 0; i < getDeviceCount(); i++ {
+		d, ret := nvml.DeviceGetHandleByIndex(i)
+		checkAndPanic(ret)
 
-		log.Printf("Device Memory: %d, vGPU Count: %d", uint(*d.Memory), vGPUCount)
+		log.Printf("Device Memory: %d, vGPU Count: %d", getDeviceMemory(d), vGPUCount)
 
-		for j := uint(0); j < uint(vGPUCount); j++ {
-			vGPUDeviceID := getVGPUID(d.UUID, j)
+		for j := 0; j < vGPUCount; j++ {
+			vGPUDeviceID := getVGPUID(getDeviceUUID(d), j)
 			dev := pluginapi.Device{
 				ID:     vGPUDeviceID,
 				Health: pluginapi.Healthy,
@@ -70,27 +75,35 @@ func getVGPUDevices(vGPUCount int) []*pluginapi.Device {
 	return devs
 }
 
-func getDeviceCount() uint {
-	n, err := nvml.GetDeviceCount()
-	check(err)
+func getDeviceCount() int {
+	n, ret := nvml.DeviceGetCount()
+	checkAndPanic(ret)
 	return n
 }
 
-func getPhysicalGPUDevices() []string {
-	n, err := nvml.GetDeviceCount()
-	check(err)
+func getDeviceUUID(device nvml.Device) string {
+	uuid, ret := device.GetUUID()
+	checkAndPanic(ret)
+	return uuid
+}
 
+func getDeviceMemory(device nvml.Device) uint64 {
+	mem, ret := device.GetMemoryInfo()
+	checkAndPanic(ret)
+	return mem.Total
+}
+
+func getPhysicalGPUDevices() []string {
 	var devs []string
-	for i := uint(0); i < n; i++ {
-		d, err := nvml.NewDevice(i)
-		check(err)
-		devs = append(devs, d.UUID)
+	for i := 0; i < getDeviceCount(); i++ {
+		d, ret := nvml.DeviceGetHandleByIndex(i)
+		checkAndPanic(ret)
+		devs = append(devs, getDeviceUUID(d))
 	}
-
 	return devs
 }
 
-func getVGPUID(deviceID string, vGPUIndex uint) string {
+func getVGPUID(deviceID string, vGPUIndex int) string {
 	return fmt.Sprintf("%s-%d", deviceID, vGPUIndex)
 }
 
@@ -118,11 +131,12 @@ func physicialDeviceExists(devs []string, id string) bool {
 }
 
 func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
-	eventSet := nvml.NewEventSet()
-	defer nvml.DeleteEventSet(eventSet)
+	eventSet, ret := nvml.EventSetCreate()
+	checkAndPanic(ret)
+	defer nvml.EventSetFree(eventSet)
 	var physicalDeviceIDs []string
 
-	// We don't have to loop all virtual GPUs here. Only need to check physical CPUs.
+	// We don't have to loop all virtual GPUs here. Only need to check physical GPUs.
 	for _, d := range devs {
 		physicalDeviceID := getPhysicalDeviceID(d.ID)
 		if physicialDeviceExists(physicalDeviceIDs, physicalDeviceID) {
@@ -131,17 +145,16 @@ func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
 		physicalDeviceIDs = append(physicalDeviceIDs, physicalDeviceID)
 
 		log.Printf("virtual id %s physical id %s", d.ID, physicalDeviceID)
-		err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, physicalDeviceID)
-		if err != nil && strings.HasSuffix(err.Error(), "Not Supported") {
-			log.Printf("Warning: %s is too old to support healthchecking: %s. Marking it unhealthy.", physicalDeviceID, err)
 
+		device, ret := nvml.DeviceGetHandleByUUID(physicalDeviceID)
+		checkAndPanic(ret)
+		ret = nvml.DeviceRegisterEvents(device, nvml.EventTypeXidCriticalError, eventSet)
+		if ret == nvml.ERROR_NOT_SUPPORTED {
+			log.Printf("Warning: %s is too old to support healthchecking: %s. Marking it unhealthy.", physicalDeviceID, nvml.ErrorString(ret))
 			xids <- d
 			continue
 		}
-
-		if err != nil {
-			log.Panicln("Fatal:", err)
-		}
+		checkAndPanic(ret)
 	}
 
 	for {
@@ -151,30 +164,33 @@ func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
 		default:
 		}
 
-		e, err := nvml.WaitForEvent(eventSet, 5000)
-		if err != nil && e.Etype != nvml.XidCriticalError {
+		e, ret := nvml.EventSetWait(eventSet, 5000)
+		checkAndPanic(ret)
+		if e.EventType != nvml.EventTypeXidCriticalError {
 			continue
 		}
 
 		// FIXME: formalize the full list and document it.
 		// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
 		// Application errors: the GPU should still be healthy
-		if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 {
+		if e.EventData == 31 || e.EventData == 43 || e.EventData == 45 {
 			continue
 		}
 
-		if e.UUID == nil || len(*e.UUID) == 0 {
+		uuid, ret := e.Device.GetUUID()
+		checkAndPanic(ret)
+		if len(uuid) == 0 {
 			// All devices are unhealthy
 			for _, d := range devs {
-				log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata)
+				log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.EventData)
 				xids <- d
 			}
 			continue
 		}
 
 		for _, d := range devs {
-			if d.ID == *e.UUID {
-				log.Printf("XidCriticalError: Xid=%d on GPU=%s, the device will go unhealthy.", e.Edata, d.ID)
+			if d.ID == uuid {
+				log.Printf("XidCriticalError: Xid=%d on GPU=%s, the device will go unhealthy.", e.EventData, d.ID)
 				xids <- d
 			}
 		}