Skip to content

Commit d7a2ee9

Browse files
committed
fix: move driver_version into nvidia_smi_gpu_info labels
Signed-off-by: Utku Ozdemir <[email protected]>
1 parent 27316b4 commit d7a2ee9

File tree

7 files changed: 35 additions and 19 deletions.

Diff for: internal/exporter/_query-test.txt

+3 -3 (3 additions, 3 deletions)
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
1-
uuid, name, driver_model.current, driver_model.pending, vbios_version, fan.speed [%], memory.used [MiB]
2-
GPU-df6e7a7c-7314-46f8-abc4-b88b36dcf3aa, NVIDIA GeForce RTX 2080 SUPER, WDDM, WDDM, 90.04.7a.40.73, 38 %, 575 MiB
3-
GPU-04757e3e-3077-4e2e-b988-7e2d647b52e9, Some Other GPU, DoesntMatter, DoesntMatter, 1a.2b.3c.4d, 50 %, 1234 MiB
1+
uuid, name, driver_model.current, driver_model.pending, vbios_version, driver_version, fan.speed [%], memory.used [MiB]
2+
GPU-df6e7a7c-7314-46f8-abc4-b88b36dcf3aa, NVIDIA GeForce RTX 2080 SUPER, WDDM, WDDM, 90.04.7a.40.73, 466.63, 38 %, 575 MiB
3+
GPU-04757e3e-3077-4e2e-b988-7e2d647b52e9, Some Other GPU, DoesntMatter, DoesntMatter, 1a.2b.3c.4d, 123.45, 50 %, 1234 MiB

Diff for: internal/exporter/csv.go

+10 -3 (10 additions, 3 deletions)
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
package exporter
22

3-
import "strings"
3+
import (
4+
"fmt"
5+
"strings"
6+
)
47

58
type table struct {
69
rows []row
@@ -19,7 +22,7 @@ type cell struct {
1922
rawValue string
2023
}
2124

22-
func parseCSVIntoTable(queryResult string, qFields []qField) table {
25+
func parseCSVIntoTable(queryResult string, qFields []qField) (table, error) {
2326
lines := strings.Split(strings.TrimSpace(queryResult), "\n")
2427
titlesLine := lines[0]
2528
valuesLines := lines[1:]
@@ -39,6 +42,10 @@ func parseCSVIntoTable(queryResult string, qFields []qField) table {
3942
qFieldToCell := make(map[qField]cell, numCols)
4043
cells := make([]cell, numCols)
4144
rawValues := parseCSVLine(valuesLine)
45+
if len(qFields) != len(rFields) {
46+
return table{}, fmt.Errorf("query fields (%d) and returned fields (%d) have different sizes", len(qFields), len(rFields))
47+
}
48+
4249
for colIndex, rawValue := range rawValues {
4350
q := qFields[colIndex]
4451
r := rFields[colIndex]
@@ -65,7 +72,7 @@ func parseCSVIntoTable(queryResult string, qFields []qField) table {
6572
rows: rows,
6673
rFields: rFields,
6774
qFieldToCells: qFieldToCells,
68-
}
75+
}, nil
6976
}
7077

7178
func parseCSVLine(line string) []string {

Diff for: internal/exporter/csv_test.go

+2 -1 (2 additions, 1 deletion)
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,8 @@ Some Dummy GPU, 12.34 W
1414
)
1515

1616
func TestParseCsvIntoTable(t *testing.T) {
17-
parsed := parseCSVIntoTable(testCsv, []qField{"name", "power.draw"})
17+
parsed, err := parseCSVIntoTable(testCsv, []qField{"name", "power.draw"})
18+
assert.NoError(t, err)
1819
assert.Len(t, parsed.rows, 2)
1920
assert.Equal(t, []rField{"name", "power.draw [W]"}, parsed.rFields)
2021

Diff for: internal/exporter/exporter.go

+9 -2 (9 additions, 2 deletions)
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ var (
3333
{qField: driverModelCurrentQField, label: "driver_model_current"},
3434
{qField: driverModelPendingQField, label: "driver_model_pending"},
3535
{qField: vBiosVersionQField, label: "vbios_version"},
36+
{qField: driverVersionQField, label: "driver_version"},
3637
}
3738

3839
runCmd = func(cmd *exec.Cmd) error { return cmd.Run() }
@@ -156,9 +157,11 @@ func (e *gpuExporter) Collect(ch chan<- prometheus.Metric) {
156157
driverModelCurrent := r.qFieldToCells[driverModelCurrentQField].rawValue
157158
driverModelPending := r.qFieldToCells[driverModelPendingQField].rawValue
158159
vBiosVersion := r.qFieldToCells[vBiosVersionQField].rawValue
160+
driverVersion := r.qFieldToCells[driverVersionQField].rawValue
159161

160162
infoMetric := prometheus.MustNewConstMetric(e.gpuInfoDesc, prometheus.GaugeValue,
161-
1, uuid, name, driverModelCurrent, driverModelPending, vBiosVersion)
163+
1, uuid, name, driverModelCurrent,
164+
driverModelPending, vBiosVersion, driverVersion)
162165
ch <- infoMetric
163166

164167
for _, c := range r.cells {
@@ -193,7 +196,11 @@ func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) {
193196
return nil, fmt.Errorf("command failed. stderr: %s err: %w", stderr.String(), err)
194197
}
195198

196-
t := parseCSVIntoTable(strings.TrimSpace(stdout.String()), qFields)
199+
t, err := parseCSVIntoTable(strings.TrimSpace(stdout.String()), qFields)
200+
if err != nil {
201+
return nil, err
202+
}
203+
197204
return &t, nil
198205
}
199206

Diff for: internal/exporter/exporter_test.go

+4 -3 (4 additions, 3 deletions)
Original file line number | Diff line number | Diff line change
@@ -157,7 +157,7 @@ end:
157157
}
158158
}
159159

160-
assert.Len(t, descStrs, 9)
160+
assert.Len(t, descStrs, 10)
161161
descs := strings.Join(descStrs, "\n")
162162
assert.Contains(t, descs, "aaa_fan_speed")
163163
assert.Contains(t, descs, "aaa_memory_used")
@@ -168,6 +168,7 @@ end:
168168
assert.Contains(t, descs, "aaa_driver_model_current")
169169
assert.Contains(t, descs, "aaa_driver_model_pending")
170170
assert.Contains(t, descs, "aaa_vbios_version")
171+
assert.Contains(t, descs, "aaa_driver_version")
171172
}
172173

173174
func TestCollect(t *testing.T) {
@@ -182,7 +183,7 @@ func TestCollect(t *testing.T) {
182183
logger := log.NewNopLogger()
183184
exp, err := New("aaa", "bbb",
184185
"uuid,name,driver_model.current,driver_model.pending,"+
185-
"vbios_version,fan.speed,memory.used", logger)
186+
"vbios_version,driver_version,fan.speed,memory.used", logger)
186187
assert.NoError(t, err)
187188

188189
doneCh := make(chan bool)
@@ -207,7 +208,7 @@ end:
207208

208209
metricsJoined := strings.Join(metrics, "\n")
209210

210-
assert.Len(t, metrics, 7)
211+
assert.Len(t, metrics, 9)
211212
assert.Contains(t, metricsJoined, "aaa_gpu_info")
212213
assert.Contains(t, metricsJoined, "aaa_name")
213214
assert.Contains(t, metricsJoined, "aaa_fan_speed_ratio")

Diff for: internal/exporter/fields.go

+5 (5 additions, no deletions)
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ package exporter
22

33
import (
44
"bytes"
5+
"errors"
56
"os/exec"
67
"regexp"
78
"strings"
@@ -13,6 +14,7 @@ const (
1314
driverModelCurrentQField qField = "driver_model.current"
1415
driverModelPendingQField qField = "driver_model.pending"
1516
vBiosVersionQField qField = "vbios_version"
17+
driverVersionQField qField = "driver_version"
1618
qFieldsAuto = "AUTO"
1719
DefaultQField = qFieldsAuto
1820
)
@@ -153,6 +155,9 @@ func ParseAutoQFields(nvidiaSmiCommand string) ([]qField, error) {
153155

154156
out := stdout.String()
155157
fields := extractQFields(out)
158+
if fields == nil {
159+
return nil, errors.New("could not extract any query fields")
160+
}
156161
return fields, nil
157162
}
158163

Diff for: samples/sample-source.sh

+2 -7 (2 additions, 7 deletions)
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,4 @@
11
#!/usr/bin/env sh
22

3-
4-
5-
6-
echo "timestamp, driver_version, count, name, serial, uuid, pci.bus_id, pci.domain, pci.bus, pci.device, pci.device_id, pci.sub_device_id, pcie.link.gen.current, pcie.link.gen.max, pcie.link.width.current, pcie.link.width.max, index, display_mode, display_active, persistence_mode, accounting.mode, accounting.buffer_size, driver_model.current, driver_model.pending, vbios_version, inforom.img, inforom.oem, inforom.ecc, inforom.pwr, gom.current, gom.pending, fan.speed [%], pstate, clocks_throttle_reasons.supported, clocks_throttle_reasons.active, clocks_throttle_reasons.gpu_idle, clocks_throttle_reasons.applications_clocks_setting, clocks_throttle_reasons.sw_power_cap, clocks_throttle_reasons.hw_slowdown, clocks_throttle_reasons.hw_thermal_slowdown, clocks_throttle_reasons.hw_power_brake_slowdown, clocks_throttle_reasons.sw_thermal_slowdown, clocks_throttle_reasons.sync_boost, memory.total [MiB], memory.used [MiB], memory.free [MiB], compute_mode, utilization.gpu [%], utilization.memory [%], encoder.stats.sessionCount, encoder.stats.averageFps, encoder.stats.averageLatency, ecc.mode.current, ecc.mode.pending, ecc.errors.corrected.volatile.device_memory, ecc.errors.corrected.volatile.dram, ecc.errors.corrected.volatile.register_file, ecc.errors.corrected.volatile.l1_cache, ecc.errors.corrected.volatile.l2_cache, ecc.errors.corrected.volatile.texture_memory, ecc.errors.corrected.volatile.cbu, ecc.errors.corrected.volatile.sram, ecc.errors.corrected.volatile.total, ecc.errors.corrected.aggregate.device_memory, ecc.errors.corrected.aggregate.dram, ecc.errors.corrected.aggregate.register_file, ecc.errors.corrected.aggregate.l1_cache, ecc.errors.corrected.aggregate.l2_cache, ecc.errors.corrected.aggregate.texture_memory, ecc.errors.corrected.aggregate.cbu, ecc.errors.corrected.aggregate.sram, ecc.errors.corrected.aggregate.total, ecc.errors.uncorrected.volatile.device_memory, ecc.errors.uncorrected.volatile.dram, ecc.errors.uncorrected.volatile.register_file, 
ecc.errors.uncorrected.volatile.l1_cache, ecc.errors.uncorrected.volatile.l2_cache, ecc.errors.uncorrected.volatile.texture_memory, ecc.errors.uncorrected.volatile.cbu, ecc.errors.uncorrected.volatile.sram, ecc.errors.uncorrected.volatile.total, ecc.errors.uncorrected.aggregate.device_memory, ecc.errors.uncorrected.aggregate.dram, ecc.errors.uncorrected.aggregate.register_file, ecc.errors.uncorrected.aggregate.l1_cache, ecc.errors.uncorrected.aggregate.l2_cache, ecc.errors.uncorrected.aggregate.texture_memory, ecc.errors.uncorrected.aggregate.cbu, ecc.errors.uncorrected.aggregate.sram, ecc.errors.uncorrected.aggregate.total, retired_pages.single_bit_ecc.count, retired_pages.double_bit.count, retired_pages.pending, temperature.gpu, temperature.memory, power.management, power.draw [W], power.limit [W], enforced.power.limit [W], power.default_limit [W], power.min_limit [W], power.max_limit [W], clocks.current.graphics [MHz], clocks.current.sm [MHz], clocks.current.memory [MHz], clocks.current.video [MHz], clocks.applications.graphics [MHz], clocks.applications.memory [MHz], clocks.default_applications.graphics [MHz], clocks.default_applications.memory [MHz], clocks.max.graphics [MHz], clocks.max.sm [MHz], clocks.max.memory [MHz], mig.mode.current, mig.mode.pending"
7-
echo "2021/06/09 23:27:38.358, 466.63, 1, NVIDIA GeForce RTX 2080 SUPER, [N/A], GPU-df6e7a7c-7314-46f8-abc4-b88b36dcf3aa, 00000000:0C:00.0, 0x0000, 0x0C, 0x00, 0x1E8110DE, 0x40051458, 1, 3, 16, 16, 0, Enabled, Enabled, [N/A], Disabled, 4000, WDDM, WDDM, 90.04.7A.40.73, G001.0000.02.04, 1.1, [N/A], [N/A], [N/A], [N/A], 38 %, P8, 0x00000000000001FF, 0x0000000000000001, Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, 8192 MiB, 856 MiB, 7336 MiB, Default, 0 %, 10 %, 0, 0, 0, [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], 35, N/A, Enabled, 31.12 W, 250.00 W, 250.00 W, 250.00 W, 105.00 W, 350.00 W, 300 MHz, 300 MHz, 405 MHz, 540 MHz, [N/A], [N/A], [N/A], [N/A], 2145 MHz, 2145 MHz, 7751 MHz, [N/A], [N/A]"
8-
echo "2042/06/09 23:27:38.358, 123.45, 1, Dummy GPU 1, [N/A], GPU-523e933c-cf12-4d85-95b4-dd144e8fc516, 00000000:0C:00.0, 0x0000, 0x0C, 0x00, 0x1E8110DE, 0x40051458, 1, 3, 16, 16, 0, Enabled, Enabled, [N/A], Disabled, 4000, WDDM, WDDM, 90.04.7A.40.73, G001.0000.02.04, 1.1, [N/A], [N/A], [N/A], [N/A], 38 %, P8, 0x00000000000001FF, 0x0000000000000001, Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, 8192 MiB, 856 MiB, 7336 MiB, Default, 0 %, 10 %, 0, 0, 0, [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], 35, N/A, Enabled, 31.12 W, 250.00 W, 250.00 W, 250.00 W, 105.00 W, 350.00 W, 300 MHz, 300 MHz, 405 MHz, 540 MHz, [N/A], [N/A], [N/A], [N/A], 2145 MHz, 2145 MHz, 7751 MHz, [N/A], [N/A]"
9-
echo "2021/06/12 18:45:18.358, 466.63, 1, Dummy GPU 2, [N/A], GPU-b2fe4f12-c3dd-4fa4-914b-9e7b975a0faa, 00000000:0C:00.0, 0x0000, 0x0C, 0x00, 0x1E8110DE, 0x40051458, 1, 3, 16, 16, 0, Enabled, Disabled, [N/A], Disabled, 4000, WDDM, WDDM, 90.04.7A.40.73, G001.0000.02.04, 1.1, [N/A], [N/A], [N/A], [N/A], 38 %, P8, 0x00000000000001FF, 0x0000000000000001, Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, 8192 MiB, 779 MiB, 7413 MiB, Default, 2 %, 0 %, 0, 0, 0, [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], 34, N/A, Enabled, 29.50 W, 250.00 W, 250.00 W, 250.00 W, 105.00 W, 350.00 W, 300 MHz, 300 MHz, 405 MHz, 540 MHz, [N/A], [N/A], [N/A], [N/A], 2235 MHz, 2235 MHz, 7751 MHz, [N/A], [N/A]"
3+
echo "driver_version,uuid,name,driver_model.current,driver_model.pending,vbios_version"
4+
echo "460.91.03,GPU-df6e7a7c-7314-46f8-abc4-b88b36dcf3aa,NVIDIA GeForce RTX 2080 SUPER, WDDM, WDDM, 90.04.7A.40.73"

0 commit comments

Comments
 (0)