Skip to content

Commit

Permalink
Add mtr_runs_total metric and track failed runs
Browse files Browse the repository at this point in the history
The new metric `mtr_runs_total` keeps track of how often a
certain job of `mtr` was launched / returned. To allow detecting
issues in the network (for which `mtr` returns "only" an error message),
the error message of `mtr` is used as "error" label. A successful run
is lacking the "error" label. Thus:

* `sum by(mtr_exporter_job)(mtr_runs_total{})` provides the absolute total
  number of `mtr` runs
* `sum by(mtr_exporter_job, error)(mtr_runs_total{error!=""})` provides the
  failed runs
* the difference between the two previous queries is the number of
  successful runs

The error message of `mtr` is "normalized": all chars in the original
`mtr` message are lower cased, only `[0-9a-z: -]` are kept, everything
else is mapped to `-`.

Also, as long as a job has not returned anything yet, the metrics
exporter does not report uninitialized values for it.
  • Loading branch information
mgumz committed Feb 7, 2025
1 parent b7ade29 commit ca39916
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 13 deletions.
10 changes: 8 additions & 2 deletions pkg/job/cron.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package job

import (
"fmt"
"log"
)

Expand All @@ -12,6 +13,11 @@ func (job *Job) Run() {
log.Printf("info: %q failed: %s", job.Label, err)
return
}
log.Printf("info: %q done: %d hops in %s.", job.Label,
len(job.Report.Hubs), job.Duration)

errMsg := ""
if job.Report.ErrorMsg != "" {
errMsg = fmt.Sprintf("(err: %q)", job.Report.ErrorMsg)
}
log.Printf("info: %q done%s: %d hops in %s.", job.Label,
errMsg, len(job.Report.Hubs), job.Duration)
}
42 changes: 35 additions & 7 deletions pkg/job/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@ type JobMeta struct {
Schedule string
Label string
CmdLine string

Runs map[string]int64
}

// DataAvailable reports whether the Runs map holds at least one entry.
func (jm *JobMeta) DataAvailable() bool {
	return len(jm.Runs) != 0
}

type Job struct {
JobMeta

Expand All @@ -30,14 +34,15 @@ type Job struct {

func NewJob(mtr string, args []string, schedule string) *Job {
extra := []string{
"-j", // json output
"-j", // JSON output
}
args = append(extra, args...)
job := Job{
args: args,
mtrBinary: mtr,
cmdLine: strings.Join(append([]string{mtr}, args...), " "),
}
job.JobMeta.Runs = map[string]int64{}
job.JobMeta.Schedule = schedule
job.JobMeta.CmdLine = job.cmdLine
return &job
Expand All @@ -50,22 +55,28 @@ func (job *Job) Launch() error {
cmd := exec.Command(job.mtrBinary, job.args...)

// launch mtr
buf := bytes.Buffer{}
cmd.Stdout = &buf
bufStdout, bufStderr := bytes.Buffer{}, bytes.Buffer{}
cmd.Stdout, cmd.Stderr = &bufStdout, &bufStderr
launched := time.Now()
if err := cmd.Run(); err != nil {
return err
}
cmd.Run()
duration := time.Since(launched)

errMsg := normalizeMtrErrorMsg(bufStderr.String())
if val, exists := job.Runs[errMsg]; exists {
job.Runs[errMsg] = val + 1
} else {
job.Runs[errMsg] = 1
}

// decode the report
report := mtr.Report{}
if err := report.Decode(&buf); err != nil {
if err := report.Decode(&bufStdout); err != nil {
return err
}

// copy the report into the job
job.JobMeta.Report = report
job.JobMeta.Report.ErrorMsg = errMsg
job.JobMeta.Launched = launched
job.JobMeta.Duration = duration

Expand All @@ -76,3 +87,20 @@ func (job *Job) Launch() error {
// done.
return nil
}

// normalizeMtrErrorMsg trims surrounding whitespace from msg and then maps
// every rune of the remainder: ASCII upper-case letters are lower-cased;
// ASCII lower-case letters, digits, spaces and colons are kept as they are;
// every other rune is replaced by '-'.
func normalizeMtrErrorMsg(msg string) string {
	trimmed := strings.TrimSpace(msg)
	return strings.Map(func(c rune) rune {
		if c >= 'A' && c <= 'Z' {
			// Shift into the lower-case ASCII range.
			return c - 'A' + 'a'
		}
		if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == ' ' || c == ':' {
			return c
		}
		return '-'
	}, trimmed)
}
22 changes: 22 additions & 0 deletions pkg/job/job_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package job

import (
"testing"
)

// Test_NormalizeMtrErrorMsg runs normalizeMtrErrorMsg over a table of raw
// messages and fails on the first one whose result differs from the
// expected normalized form.
func Test_NormalizeMtrErrorMsg(t *testing.T) {
	type fixture struct {
		msg        string
		normalized string
	}
	cases := []fixture{
		{msg: "mtr: Unexpected mtr-packet error", normalized: "mtr: unexpected mtr-packet error"},
	}

	for _, tc := range cases {
		got := normalizeMtrErrorMsg(tc.msg)
		if got == tc.normalized {
			continue
		}
		t.Fatalf("expected %q for %q, got %q", tc.normalized, tc.msg, got)
	}
}
19 changes: 17 additions & 2 deletions pkg/job/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ func (c *Collector) ServeHTTP(w http.ResponseWriter, r *http.Request) {
c.mu.Lock()
defer c.mu.Unlock()

fmt.Fprintln(w, "# HELP mtr_runs_total number of mtr runs")
fmt.Fprintln(w, "# TYPE mtr_runs_total counter")
fmt.Fprintln(w, "# HELP mtr_report_duration_seconds duration of last mtr run (in seconds)")
fmt.Fprintln(w, "# TYPE mtr_report_duration_seconds gauge")
fmt.Fprintln(w, "# HELP mtr_report_count_hubs number of hops visited in the last mtr run")
Expand All @@ -42,7 +44,7 @@ func (c *Collector) ServeHTTP(w http.ResponseWriter, r *http.Request) {

for _, job := range c.jobs {

if len(job.Report.Hubs) == 0 {
if !job.DataAvailable() {
continue
}

Expand All @@ -57,16 +59,29 @@ func (c *Collector) ServeHTTP(w http.ResponseWriter, r *http.Request) {
labels["mtr_exporter_job"] = job.Label
tsMs := ts.UnixNano() / int64(time.Millisecond)

fmt.Fprintf(w, "# mtr run %s: %s -- %s\n", job.Label, ts.Format(time.RFC3339Nano), job.CmdLine)
errMsg := ""
if report.ErrorMsg != "" {
errMsg = fmt.Sprintf(" # (err: %q)", report.ErrorMsg)
}
fmt.Fprintf(w, "# mtr run %s: %s -- %s%s\n", job.Label, ts.Format(time.RFC3339Nano), job.CmdLine, errMsg)

l := labels2Prom(labels)

for k, v := range job.Runs {
fmt.Fprintf(w, "mtr_runs_total{%s%s} %d %d\n",
l, fmt.Sprintf("error=%q", k), v, tsMs)
}

fmt.Fprintf(w, "mtr_report_duration_seconds{%s} %f %d\n",
l, float64(d)/float64(time.Second), tsMs)

fmt.Fprintf(w, "mtr_report_count_hubs{%s} %d %d\n",
l, len(report.Hubs), tsMs)

if len(job.Report.Hubs) == 0 {
continue
}

lh := len(report.Hubs) - 1
for i, hub := range report.Hubs {
labels["host"] = hub.Host
Expand Down
5 changes: 3 additions & 2 deletions pkg/mtr/mtr.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ type Result struct {
}

type Report struct {
Mtr Mtr `json:"mtr"`
Hubs []Hub `json:"hubs"`
Mtr Mtr `json:"mtr"`
Hubs []Hub `json:"hubs"`
ErrorMsg string // carrying the error message of mtr
}

type Mtr struct {
Expand Down
26 changes: 26 additions & 0 deletions pkg/mtr/mtr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,32 @@ func Test_MtrReportDecoding(t *testing.T) {
}
}

// Test_MtrEmptyHubs decodes a report whose "hubs" array is empty and checks
// that decoding succeeds and that the resulting Hubs slice has length zero.
func Test_MtrEmptyHubs(t *testing.T) {
	const body = `
{
"report": {
"mtr": {
"src": "example-src.test",
"dst": "example-dst.invalid",
"tos": 0,
"tests": 10,
"psize": "64",
"bitpattern": "0x00"
},
"hubs": []
}
}`

	report := new(Report)
	err := report.Decode(strings.NewReader(body))
	if err != nil {
		t.Fatalf("error decoding: %s\n%s", err, body)
	}

	if n := len(report.Hubs); n != 0 {
		t.Fatalf("error: expected [] hubs, got %d", n)
	}
}

func Test_MtrJSONDecoding(t *testing.T) {

fixtures := []string{
Expand Down

0 comments on commit ca39916

Please sign in to comment.