Add vgpu and mps metrics

ghokun · ghokun · commit dbdc3dec633d · 2022-03-24T17:23:58.000+03:00
Signed-off-by: ghokun <gokhun@gmail.com>
diff --git a/manifests/device-plugin.yml b/manifests/device-plugin.yml
@@ -45,7 +45,7 @@ spec:
             capabilities:
               add: ["SYS_ADMIN"]
       containers:
-        - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.6.2
+        - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.6.3
           name: kuartis-virtual-gpu-device-plugin-ctr
           command:
             - /usr/bin/virtual-gpu-device-plugin
diff --git a/pkg/gpu/nvidia/metrics.go b/pkg/gpu/nvidia/metrics.go
@@ -29,29 +29,33 @@ var node = os.Getenv("NODE_NAME")
 var metricsFormat = `# HELP gpu_memory_usage_per_container Shows the GPU memory usage per container.
 # TYPE gpu_memory_usage_per_container gauge
 {{- range $m := . }}
-gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}"} {{ $m.UsedGpuMemory }}
+gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}",vgpucount="{{ $m.VGpuCount }}",mpsactivethread="{{ $m.MpsActiveThread }}"} {{ $m.UsedGpuMemory }}
 {{- end -}}`
 
 type metric struct {
-	Pid           uint32
-	UsedGpuMemory uint64
-	GpuIndex      int
-	GpuUUID       string
-	Node          string
-	Namespace     string
-	Pod           string
-	PodUid        string
-	Container     string
-	ContainerId   string
+	Pid             uint32
+	UsedGpuMemory   uint64
+	GpuIndex        int
+	GpuUUID         string
+	Node            string
+	Namespace       string
+	Pod             string
+	PodUid          string
+	Container       string
+	ContainerId     string
+	VGpuCount       string
+	MpsActiveThread string
 }
 
 type containerInfo struct {
-	Node        string
-	Namespace   string
-	Pod         string
-	PodUid      string
-	Container   string
-	ContainerId string
+	Node            string
+	Namespace       string
+	Pod             string
+	PodUid          string
+	Container       string
+	ContainerId     string
+	VGpuCount       string
+	MpsActiveThread string
 }
 
 func MetricServer() {
@@ -77,12 +81,14 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
 	containerMap := make(map[string]containerInfo)
 	for _, container := range containers.GetContainers() {
 		containerMap[container.GetId()] = containerInfo{
-			Node:        node,
-			Namespace:   container.GetLabels()["io.kubernetes.pod.namespace"],
-			Pod:         container.GetLabels()["io.kubernetes.pod.name"],
-			PodUid:      container.GetLabels()["io.kubernetes.pod.uid"],
-			Container:   container.GetMetadata().GetName(),
-			ContainerId: container.GetId(),
+			Node:            node,
+			Namespace:       container.GetLabels()["io.kubernetes.pod.namespace"],
+			Pod:             container.GetLabels()["io.kubernetes.pod.name"],
+			PodUid:          container.GetLabels()["io.kubernetes.pod.uid"],
+			Container:       container.GetMetadata().GetName(),
+			ContainerId:     container.GetId(),
+			VGpuCount:       container.GetAnnotations()["k8s.kuartis.com/vgpu-count"],
+			MpsActiveThread: container.GetAnnotations()["k8s.kuartis.com/mps-active-thread"],
 		}
 	}
 	collected := []metric{}
@@ -97,16 +103,18 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
 			if container, ok := containerMap[containerId]; ok {
 				log.Printf("Using %s Found container %+v for process: %d", containerId, container, process.Pid)
 				collected = append(collected, metric{
-					Pid:           process.Pid,
-					UsedGpuMemory: process.UsedGpuMemory,
-					GpuIndex:      i,
-					GpuUUID:       getDeviceUUID(d),
-					Node:          container.Node,
-					Namespace:     container.Namespace,
-					Pod:           container.Pod,
-					PodUid:        container.PodUid,
-					Container:     container.Container,
-					ContainerId:   container.ContainerId,
+					Pid:             process.Pid,
+					UsedGpuMemory:   process.UsedGpuMemory,
+					GpuIndex:        i,
+					GpuUUID:         getDeviceUUID(d),
+					Node:            container.Node,
+					Namespace:       container.Namespace,
+					Pod:             container.Pod,
+					PodUid:          container.PodUid,
+					Container:       container.Container,
+					ContainerId:     container.ContainerId,
+					VGpuCount:       container.VGpuCount,
+					MpsActiveThread: container.MpsActiveThread,
 				})
 			}
 		}
diff --git a/pkg/gpu/nvidia/server.go b/pkg/gpu/nvidia/server.go
@@ -290,17 +290,23 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.Alloc
 
 		cresp := new(pluginapi.ContainerAllocateResponse)
 
+		cudaActiveThread := fmt.Sprintf("%d", 100*len(req.DevicesIDs)/(len(m.devs)/len(m.physicalDevs)*len(visibleDevs)))
+		visibleDevsStr := strings.Join(visibleDevs, ",")
+		allocatedDeviceIdsStr := strings.Join(req.DevicesIDs, ",")
+
 		cresp.Envs = map[string]string{}
-		cresp.Envs["NVIDIA_VISIBLE_DEVICES"] = strings.Join(visibleDevs, ",")
-		cresp.Envs["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = fmt.Sprintf("%d", 100*len(req.DevicesIDs)/len(m.devs))
+		cresp.Envs["NVIDIA_VISIBLE_DEVICES"] = visibleDevsStr
+		cresp.Envs["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = cudaActiveThread
 
 		cresp.Annotations = map[string]string{}
-		cresp.Annotations["k8s.kuartis.com/gpu-ids"] = strings.Join(visibleDevs, ",")
-		cresp.Annotations["k8s.kuartis.com/vgpu-ids"] = strings.Join(req.DevicesIDs, ",")
-
-		log.Printf("Allocated physical devices: %s", strings.Join(visibleDevs, ","))
-		log.Printf("Allocated virtual devices: %s", strings.Join(req.DevicesIDs, ","))
-		log.Printf("Allocated MPS ACTIVE THREAD PERCENTAGE: %s", fmt.Sprintf("%d", 100*len(req.DevicesIDs)/len(m.devs)))
+		cresp.Annotations["k8s.kuartis.com/gpu-ids"] = visibleDevsStr
+		cresp.Annotations["k8s.kuartis.com/vgpu-ids"] = allocatedDeviceIdsStr
+		cresp.Annotations["k8s.kuartis.com/vgpu-count"] = fmt.Sprintf("%d", len(req.DevicesIDs))
+		cresp.Annotations["k8s.kuartis.com/mps-active-thread"] = cudaActiveThread
+
+		log.Printf("Allocated physical devices: %s", visibleDevsStr)
+		log.Printf("Allocated virtual devices: %s", allocatedDeviceIdsStr)
+		log.Printf("Allocated MPS ACTIVE THREAD PERCENTAGE: %s", cudaActiveThread)
 
 		response.ContainerResponses = append(response.ContainerResponses, cresp)
 	}