Skip to content

Commit 99f3c37

Browse files
add additional stats from mdstat (#380)
* Add several Infiniband counters

  Counters added:
  * excessive_buffer_overrun_errors
  * local_link_integrity_errors

  Signed-off-by: Trey Dockendorf <[email protected]>
  Signed-off-by: John Seekins <[email protected]>

* add additional stats from mdstat

  Signed-off-by: John Seekins <[email protected]>

* return successful values every time

  Signed-off-by: John Seekins <[email protected]>

* add count of 'downed' disks

  Signed-off-by: John Seekins <[email protected]>

Co-authored-by: Trey Dockendorf <[email protected]>
1 parent 70418d8 commit 99f3c37

File tree

2 files changed

+94
-45
lines changed

2 files changed

+94
-45
lines changed

mdstat.go

Lines changed: 77 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,12 @@ import (
2222
)
2323

2424
var (
25-
statusLineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[[U_]+\]`)
26-
recoveryLineRE = regexp.MustCompile(`\((\d+)/\d+\)`)
27-
componentDeviceRE = regexp.MustCompile(`(.*)\[\d+\]`)
25+
statusLineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[([U_]+)\]`)
26+
recoveryLineBlocksRE = regexp.MustCompile(`\((\d+)/\d+\)`)
27+
recoveryLinePctRE = regexp.MustCompile(`= (.+)%`)
28+
recoveryLineFinishRE = regexp.MustCompile(`finish=(.+)min`)
29+
recoveryLineSpeedRE = regexp.MustCompile(`speed=(.+)[A-Z]`)
30+
componentDeviceRE = regexp.MustCompile(`(.*)\[\d+\]`)
2831
)
2932

3033
// MDStat holds info parsed from /proc/mdstat.
@@ -39,12 +42,20 @@ type MDStat struct {
3942
DisksTotal int64
4043
// Number of failed disks.
4144
DisksFailed int64
45+
// Number of "down" disks (the "_" indicators in the status line).
46+
DisksDown int64
4247
// Spare disks in the device.
4348
DisksSpare int64
4449
// Number of blocks the device holds.
4550
BlocksTotal int64
4651
// Number of blocks on the device that are in sync.
4752
BlocksSynced int64
53+
// Progress percentage of the current sync.
54+
BlocksSyncedPct float64
55+
// Estimated time until the current sync finishes (in minutes).
56+
BlocksSyncedFinishTime float64
57+
// Current sync speed (in kilobytes/sec).
58+
BlocksSyncedSpeed float64
4859
// Names of the md component devices.
4960
Devices []string
5061
}
@@ -91,7 +102,7 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
91102
// Failed disks have the suffix (F) & Spare disks have the suffix (S).
92103
fail := int64(strings.Count(line, "(F)"))
93104
spare := int64(strings.Count(line, "(S)"))
94-
active, total, size, err := evalStatusLine(lines[i], lines[i+1])
105+
active, total, down, size, err := evalStatusLine(lines[i], lines[i+1])
95106

96107
if err != nil {
97108
return nil, fmt.Errorf("error parsing md device lines: %w", err)
@@ -105,6 +116,9 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
105116
// If device is syncing at the moment, get the number of currently
106117
// synced blocks, otherwise that number equals the size of the device.
107118
syncedBlocks := size
119+
speed := float64(0)
120+
finish := float64(0)
121+
pct := float64(0)
108122
recovering := strings.Contains(lines[syncLineIdx], "recovery")
109123
resyncing := strings.Contains(lines[syncLineIdx], "resync")
110124
checking := strings.Contains(lines[syncLineIdx], "check")
@@ -124,77 +138,112 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
124138
strings.Contains(lines[syncLineIdx], "DELAYED") {
125139
syncedBlocks = 0
126140
} else {
127-
syncedBlocks, err = evalRecoveryLine(lines[syncLineIdx])
141+
syncedBlocks, pct, finish, speed, err = evalRecoveryLine(lines[syncLineIdx])
128142
if err != nil {
129143
return nil, fmt.Errorf("error parsing sync line in md device %q: %w", mdName, err)
130144
}
131145
}
132146
}
133147

134148
mdStats = append(mdStats, MDStat{
135-
Name: mdName,
136-
ActivityState: state,
137-
DisksActive: active,
138-
DisksFailed: fail,
139-
DisksSpare: spare,
140-
DisksTotal: total,
141-
BlocksTotal: size,
142-
BlocksSynced: syncedBlocks,
143-
Devices: evalComponentDevices(deviceFields),
149+
Name: mdName,
150+
ActivityState: state,
151+
DisksActive: active,
152+
DisksFailed: fail,
153+
DisksDown: down,
154+
DisksSpare: spare,
155+
DisksTotal: total,
156+
BlocksTotal: size,
157+
BlocksSynced: syncedBlocks,
158+
BlocksSyncedPct: pct,
159+
BlocksSyncedFinishTime: finish,
160+
BlocksSyncedSpeed: speed,
161+
Devices: evalComponentDevices(deviceFields),
144162
})
145163
}
146164

147165
return mdStats, nil
148166
}
149167

150-
func evalStatusLine(deviceLine, statusLine string) (active, total, size int64, err error) {
168+
func evalStatusLine(deviceLine, statusLine string) (active, total, down, size int64, err error) {
151169

152170
sizeStr := strings.Fields(statusLine)[0]
153171
size, err = strconv.ParseInt(sizeStr, 10, 64)
154172
if err != nil {
155-
return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
173+
return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
156174
}
157175

158176
if strings.Contains(deviceLine, "raid0") || strings.Contains(deviceLine, "linear") {
159177
// In the device line, only disks have a number associated with them in [].
160178
total = int64(strings.Count(deviceLine, "["))
161-
return total, total, size, nil
179+
return total, total, 0, size, nil
162180
}
163181

164182
if strings.Contains(deviceLine, "inactive") {
165-
return 0, 0, size, nil
183+
return 0, 0, 0, size, nil
166184
}
167185

168186
matches := statusLineRE.FindStringSubmatch(statusLine)
169-
if len(matches) != 4 {
170-
return 0, 0, 0, fmt.Errorf("couldn't find all the substring matches: %s", statusLine)
187+
if len(matches) != 5 {
188+
return 0, 0, 0, 0, fmt.Errorf("couldn't find all the substring matches: %s", statusLine)
171189
}
172190

173191
total, err = strconv.ParseInt(matches[2], 10, 64)
174192
if err != nil {
175-
return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
193+
return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
176194
}
177195

178196
active, err = strconv.ParseInt(matches[3], 10, 64)
179197
if err != nil {
180-
return 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
198+
return 0, 0, 0, 0, fmt.Errorf("unexpected statusLine %q: %w", statusLine, err)
181199
}
200+
down = int64(strings.Count(matches[4], "_"))
182201

183-
return active, total, size, nil
202+
return active, total, down, size, nil
184203
}
185204

186-
func evalRecoveryLine(recoveryLine string) (syncedBlocks int64, err error) {
187-
matches := recoveryLineRE.FindStringSubmatch(recoveryLine)
205+
func evalRecoveryLine(recoveryLine string) (syncedBlocks int64, pct float64, finish float64, speed float64, err error) {
206+
matches := recoveryLineBlocksRE.FindStringSubmatch(recoveryLine)
188207
if len(matches) != 2 {
189-
return 0, fmt.Errorf("unexpected recoveryLine: %s", recoveryLine)
208+
return 0, 0, 0, 0, fmt.Errorf("unexpected recoveryLine: %s", recoveryLine)
190209
}
191210

192211
syncedBlocks, err = strconv.ParseInt(matches[1], 10, 64)
193212
if err != nil {
194-
return 0, fmt.Errorf("error parsing int from recoveryLine %q: %w", recoveryLine, err)
213+
return 0, 0, 0, 0, fmt.Errorf("error parsing int from recoveryLine %q: %w", recoveryLine, err)
195214
}
196215

197-
return syncedBlocks, nil
216+
// Get percentage complete
217+
matches = recoveryLinePctRE.FindStringSubmatch(recoveryLine)
218+
if len(matches) != 2 {
219+
return syncedBlocks, 0, 0, 0, fmt.Errorf("unexpected recoveryLine matching percentage: %s", recoveryLine)
220+
}
221+
pct, err = strconv.ParseFloat(strings.TrimSpace(matches[1]), 64)
222+
if err != nil {
223+
return syncedBlocks, 0, 0, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err)
224+
}
225+
226+
// Get the estimated time left to complete
227+
matches = recoveryLineFinishRE.FindStringSubmatch(recoveryLine)
228+
if len(matches) != 2 {
229+
return syncedBlocks, pct, 0, 0, fmt.Errorf("unexpected recoveryLine matching est. finish time: %s", recoveryLine)
230+
}
231+
finish, err = strconv.ParseFloat(matches[1], 64)
232+
if err != nil {
233+
return syncedBlocks, pct, 0, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err)
234+
}
235+
236+
// Get recovery speed
237+
matches = recoveryLineSpeedRE.FindStringSubmatch(recoveryLine)
238+
if len(matches) != 2 {
239+
return syncedBlocks, pct, finish, 0, fmt.Errorf("unexpected recoveryLine matching speed: %s", recoveryLine)
240+
}
241+
speed, err = strconv.ParseFloat(matches[1], 64)
242+
if err != nil {
243+
return syncedBlocks, pct, finish, 0, fmt.Errorf("error parsing float from recoveryLine %q: %w", recoveryLine, err)
244+
}
245+
246+
return syncedBlocks, pct, finish, speed, nil
198247
}
199248

200249
func evalComponentDevices(deviceFields []string) []string {

mdstat_test.go

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,23 +25,23 @@ func TestFS_MDStat(t *testing.T) {
2525
}
2626

2727
refs := map[string]MDStat{
28-
"md127": {Name: "md127", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 312319552, BlocksSynced: 312319552, Devices: []string{"sdi2", "sdj2"}},
29-
"md0": {Name: "md0", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 248896, BlocksSynced: 248896, Devices: []string{"sdi1", "sdj1"}},
30-
"md4": {Name: "md4", ActivityState: "inactive", DisksActive: 0, DisksTotal: 0, DisksFailed: 1, DisksSpare: 1, BlocksTotal: 4883648, BlocksSynced: 4883648, Devices: []string{"sda3", "sdb3"}},
31-
"md6": {Name: "md6", ActivityState: "recovering", DisksActive: 1, DisksTotal: 2, DisksFailed: 1, DisksSpare: 1, BlocksTotal: 195310144, BlocksSynced: 16775552, Devices: []string{"sdb2", "sdc", "sda2"}},
32-
"md3": {Name: "md3", ActivityState: "active", DisksActive: 8, DisksTotal: 8, DisksFailed: 0, DisksSpare: 2, BlocksTotal: 5853468288, BlocksSynced: 5853468288, Devices: []string{"sda1", "sdh1", "sdg1", "sdf1", "sde1", "sdd1", "sdc1", "sdb1", "sdd1", "sdd2"}},
33-
"md8": {Name: "md8", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 2, BlocksTotal: 195310144, BlocksSynced: 16775552, Devices: []string{"sdb1", "sda1", "sdc", "sde"}},
34-
"md7": {Name: "md7", ActivityState: "active", DisksActive: 3, DisksTotal: 4, DisksFailed: 1, DisksSpare: 0, BlocksTotal: 7813735424, BlocksSynced: 7813735424, Devices: []string{"sdb1", "sde1", "sdd1", "sdc1"}},
35-
"md9": {Name: "md9", ActivityState: "resyncing", DisksActive: 4, DisksTotal: 4, DisksSpare: 1, DisksFailed: 2, BlocksTotal: 523968, BlocksSynced: 0, Devices: []string{"sdc2", "sdd2", "sdb2", "sda2", "sde", "sdf", "sdg"}},
36-
"md10": {Name: "md10", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 314159265, BlocksSynced: 314159265, Devices: []string{"sda1", "sdb1"}},
37-
"md11": {Name: "md11", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 1, DisksSpare: 2, BlocksTotal: 4190208, BlocksSynced: 0, Devices: []string{"sdb2", "sdc2", "sdc3", "hda", "ssdc2"}},
38-
"md12": {Name: "md12", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksSpare: 0, DisksFailed: 0, BlocksTotal: 3886394368, BlocksSynced: 3886394368, Devices: []string{"sdc2", "sdd2"}},
39-
"md120": {Name: "md120", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 2095104, BlocksSynced: 2095104, Devices: []string{"sda1", "sdb1"}},
40-
"md126": {Name: "md126", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 1855870976, BlocksSynced: 1855870976, Devices: []string{"sdb", "sdc"}},
41-
"md219": {Name: "md219", ActivityState: "inactive", DisksTotal: 0, DisksFailed: 0, DisksActive: 0, DisksSpare: 3, BlocksTotal: 7932, BlocksSynced: 7932, Devices: []string{"sdc", "sda"}},
42-
"md00": {Name: "md00", ActivityState: "active", DisksActive: 1, DisksTotal: 1, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 4186624, BlocksSynced: 4186624, Devices: []string{"xvdb"}},
43-
"md101": {Name: "md101", ActivityState: "active", DisksActive: 3, DisksTotal: 3, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 322560, BlocksSynced: 322560, Devices: []string{"sdb", "sdd", "sdc"}},
44-
"md201": {Name: "md201", ActivityState: "checking", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksSpare: 0, BlocksTotal: 1993728, BlocksSynced: 114176, Devices: []string{"sda3", "sdb3"}},
28+
"md127": {Name: "md127", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 312319552, BlocksSynced: 312319552, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdi2", "sdj2"}},
29+
"md0": {Name: "md0", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 248896, BlocksSynced: 248896, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdi1", "sdj1"}},
30+
"md4": {Name: "md4", ActivityState: "inactive", DisksActive: 0, DisksTotal: 0, DisksFailed: 1, DisksDown: 0, DisksSpare: 1, BlocksTotal: 4883648, BlocksSynced: 4883648, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda3", "sdb3"}},
31+
"md6": {Name: "md6", ActivityState: "recovering", DisksActive: 1, DisksTotal: 2, DisksFailed: 1, DisksDown: 1, DisksSpare: 1, BlocksTotal: 195310144, BlocksSynced: 16775552, BlocksSyncedPct: 8.5, BlocksSyncedFinishTime: 17, BlocksSyncedSpeed: 259783, Devices: []string{"sdb2", "sdc", "sda2"}},
32+
"md3": {Name: "md3", ActivityState: "active", DisksActive: 8, DisksTotal: 8, DisksFailed: 0, DisksDown: 0, DisksSpare: 2, BlocksTotal: 5853468288, BlocksSynced: 5853468288, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdh1", "sdg1", "sdf1", "sde1", "sdd1", "sdc1", "sdb1", "sdd1", "sdd2"}},
33+
"md8": {Name: "md8", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 2, BlocksTotal: 195310144, BlocksSynced: 16775552, BlocksSyncedPct: 8.5, BlocksSyncedFinishTime: 17, BlocksSyncedSpeed: 259783, Devices: []string{"sdb1", "sda1", "sdc", "sde"}},
34+
"md7": {Name: "md7", ActivityState: "active", DisksActive: 3, DisksTotal: 4, DisksFailed: 1, DisksDown: 1, DisksSpare: 0, BlocksTotal: 7813735424, BlocksSynced: 7813735424, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb1", "sde1", "sdd1", "sdc1"}},
35+
"md9": {Name: "md9", ActivityState: "resyncing", DisksActive: 4, DisksTotal: 4, DisksSpare: 1, DisksDown: 0, DisksFailed: 2, BlocksTotal: 523968, BlocksSynced: 0, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc2", "sdd2", "sdb2", "sda2", "sde", "sdf", "sdg"}},
36+
"md10": {Name: "md10", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 314159265, BlocksSynced: 314159265, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdb1"}},
37+
"md11": {Name: "md11", ActivityState: "resyncing", DisksActive: 2, DisksTotal: 2, DisksFailed: 1, DisksDown: 0, DisksSpare: 2, BlocksTotal: 4190208, BlocksSynced: 0, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb2", "sdc2", "sdc3", "hda", "ssdc2"}},
38+
"md12": {Name: "md12", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksSpare: 0, DisksDown: 0, DisksFailed: 0, BlocksTotal: 3886394368, BlocksSynced: 3886394368, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc2", "sdd2"}},
39+
"md120": {Name: "md120", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 2095104, BlocksSynced: 2095104, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sda1", "sdb1"}},
40+
"md126": {Name: "md126", ActivityState: "active", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 1855870976, BlocksSynced: 1855870976, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb", "sdc"}},
41+
"md219": {Name: "md219", ActivityState: "inactive", DisksTotal: 0, DisksFailed: 0, DisksActive: 0, DisksDown: 0, DisksSpare: 3, BlocksTotal: 7932, BlocksSynced: 7932, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdc", "sda"}},
42+
"md00": {Name: "md00", ActivityState: "active", DisksActive: 1, DisksTotal: 1, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 4186624, BlocksSynced: 4186624, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"xvdb"}},
43+
"md101": {Name: "md101", ActivityState: "active", DisksActive: 3, DisksTotal: 3, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 322560, BlocksSynced: 322560, BlocksSyncedPct: 0, BlocksSyncedFinishTime: 0, BlocksSyncedSpeed: 0, Devices: []string{"sdb", "sdd", "sdc"}},
44+
"md201": {Name: "md201", ActivityState: "checking", DisksActive: 2, DisksTotal: 2, DisksFailed: 0, DisksDown: 0, DisksSpare: 0, BlocksTotal: 1993728, BlocksSynced: 114176, BlocksSyncedPct: 5.7, BlocksSyncedFinishTime: 0.2, BlocksSyncedSpeed: 114176, Devices: []string{"sda3", "sdb3"}},
4545
}
4646

4747
if want, have := len(refs), len(mdStats); want != have {

0 commit comments

Comments
 (0)