@@ -29,29 +29,33 @@ var node = os.Getenv("NODE_NAME")
29
29
// metricsFormat is the Go template source used to render the metrics
// exposition text. It writes the HELP/TYPE header for
// gpu_memory_usage_per_container once, followed by one sample line per
// element of the collection passed as template data. Every element must
// expose the fields referenced below (Pid, GpuIndex, GpuUUID, Node, Namespace,
// Pod, PodUid, Container, ContainerId, VGpuCount, MpsActiveThread and
// UsedGpuMemory).
//
// The "{{-" and "-}}" trim markers remove the whitespace around the range
// action, so no blank lines are emitted between samples and the rendered text
// does not end with a newline.
//
// NOTE(review): label values are interpolated verbatim; a value containing a
// double quote, backslash or newline would produce a malformed sample line —
// confirm the inputs cannot contain those characters.
var metricsFormat = `# HELP gpu_memory_usage_per_container Shows the GPU memory usage per container.
# TYPE gpu_memory_usage_per_container gauge
{{- range $m := . }}
gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}",vgpucount="{{ $m.VGpuCount }}",mpsactivethread="{{ $m.MpsActiveThread }}"} {{ $m.UsedGpuMemory }}
{{- end -}}`
34
34
35
35
// metric is one gpu_memory_usage_per_container sample: a GPU process together
// with the container it was matched to. Its fields are the values that
// metricsFormat interpolates into a sample line.
type metric struct {
	// Pid is the process ID of the GPU process.
	Pid uint32
	// UsedGpuMemory is the GPU memory used by the process; it is rendered as
	// the sample value.
	UsedGpuMemory uint64
	// GpuIndex is the index of the device the process was found on.
	GpuIndex int
	// GpuUUID is the UUID of that device.
	GpuUUID string
	// Node is the node name, copied from the matched containerInfo.
	Node string
	// Namespace is the namespace of the pod owning the container.
	Namespace string
	// Pod is the name of the pod owning the container.
	Pod string
	// PodUid is the UID of the pod owning the container.
	PodUid string
	// Container is the container name.
	Container string
	// ContainerId is the container ID.
	ContainerId string
	// VGpuCount is copied from the matched containerInfo; it is empty when the
	// container carries no such annotation.
	VGpuCount string
	// MpsActiveThread is copied from the matched containerInfo; it is empty
	// when the container carries no such annotation.
	MpsActiveThread string
}
47
49
48
50
// containerInfo holds the identifying metadata of one container. collectMetrics
// builds a map of these keyed by container ID and copies the fields into each
// metric whose process belongs to that container.
type containerInfo struct {
	// Node is the name of the node, taken from the package-level node value.
	Node string
	// Namespace is the "io.kubernetes.pod.namespace" container label.
	Namespace string
	// Pod is the "io.kubernetes.pod.name" container label.
	Pod string
	// PodUid is the "io.kubernetes.pod.uid" container label.
	PodUid string
	// Container is the container name from the container metadata.
	Container string
	// ContainerId is the container ID; it is also the map key.
	ContainerId string
	// VGpuCount is the "k8s.kuartis.com/vgpu-count" container annotation;
	// empty when the annotation is absent.
	VGpuCount string
	// MpsActiveThread is the "k8s.kuartis.com/mps-active-thread" container
	// annotation; empty when the annotation is absent.
	MpsActiveThread string
}
56
60
57
61
func MetricServer () {
@@ -77,12 +81,14 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
77
81
containerMap := make (map [string ]containerInfo )
78
82
for _ , container := range containers .GetContainers () {
79
83
containerMap [container .GetId ()] = containerInfo {
80
- Node : node ,
81
- Namespace : container .GetLabels ()["io.kubernetes.pod.namespace" ],
82
- Pod : container .GetLabels ()["io.kubernetes.pod.name" ],
83
- PodUid : container .GetLabels ()["io.kubernetes.pod.uid" ],
84
- Container : container .GetMetadata ().GetName (),
85
- ContainerId : container .GetId (),
84
+ Node : node ,
85
+ Namespace : container .GetLabels ()["io.kubernetes.pod.namespace" ],
86
+ Pod : container .GetLabels ()["io.kubernetes.pod.name" ],
87
+ PodUid : container .GetLabels ()["io.kubernetes.pod.uid" ],
88
+ Container : container .GetMetadata ().GetName (),
89
+ ContainerId : container .GetId (),
90
+ VGpuCount : container .GetAnnotations ()["k8s.kuartis.com/vgpu-count" ],
91
+ MpsActiveThread : container .GetAnnotations ()["k8s.kuartis.com/mps-active-thread" ],
86
92
}
87
93
}
88
94
collected := []metric {}
@@ -97,16 +103,18 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
97
103
if container , ok := containerMap [containerId ]; ok {
98
104
log .Printf ("Using %s Found container %+v for process: %d" , containerId , container , process .Pid )
99
105
collected = append (collected , metric {
100
- Pid : process .Pid ,
101
- UsedGpuMemory : process .UsedGpuMemory ,
102
- GpuIndex : i ,
103
- GpuUUID : getDeviceUUID (d ),
104
- Node : container .Node ,
105
- Namespace : container .Namespace ,
106
- Pod : container .Pod ,
107
- PodUid : container .PodUid ,
108
- Container : container .Container ,
109
- ContainerId : container .ContainerId ,
106
+ Pid : process .Pid ,
107
+ UsedGpuMemory : process .UsedGpuMemory ,
108
+ GpuIndex : i ,
109
+ GpuUUID : getDeviceUUID (d ),
110
+ Node : container .Node ,
111
+ Namespace : container .Namespace ,
112
+ Pod : container .Pod ,
113
+ PodUid : container .PodUid ,
114
+ Container : container .Container ,
115
+ ContainerId : container .ContainerId ,
116
+ VGpuCount : container .VGpuCount ,
117
+ MpsActiveThread : container .MpsActiveThread ,
110
118
})
111
119
}
112
120
}
0 commit comments