From acf861af12da9e3663260a60204bb5d980666e92 Mon Sep 17 00:00:00 2001 From: Ian Kerins Date: Sat, 12 Apr 2025 00:57:10 -0400 Subject: [PATCH] Surface failed ECS API requests to the collector We occasionally see the ECS APIs called by ecs_exporter 500 on the first request or two, so the metrics served by the exporter are nonsense. By using NewInvalidMetric in response to Collect, we can ensure that HTTP 500s are served on /metrics whenever such errors occur. Closes #97. Signed-off-by: Ian Kerins --- ecscollector/collector.go | 21 +++++++++++++++++++-- ecscollector/collector_test.go | 21 +++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/ecscollector/collector.go b/ecscollector/collector.go index c696d44..01fda9c 100644 --- a/ecscollector/collector.go +++ b/ecscollector/collector.go @@ -17,6 +17,7 @@ package ecscollector import ( "context" + "fmt" "log/slog" "github.com/docker/docker/api/types/container" @@ -193,7 +194,15 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { ctx := context.Background() metadata, err := c.client.RetrieveTaskMetadata(ctx) if err != nil { - c.logger.Debug("Failed to retrieve metadata", "error", err) + c.logger.Debug("Failed to retrieve task metadata", "error", err) + // Signal that this Collect has failed. This ultimately results in an + // HTTP 500 being served for the /metrics response. + // + // While it would be most technically correct to do `NewInvalidMetric` + // for all of the Descs here, it just results in N identical error + // messages being printed in the /metrics response, so it seems a bit + // absurd to do that. + ch <- prometheus.NewInvalidMetric(taskMetadataDesc, fmt.Errorf("failed to retrieve task metadata: %w", err)) return } c.logger.Debug("Got ECS task metadata response", "metadata", metadata) @@ -261,7 +270,15 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { stats, err := c.client.RetrieveTaskStats(ctx) if err != nil { - c.logger.Debug("Failed to retrieve container stats", "error", err) + c.logger.Debug("Failed to retrieve task stats", "error", err) + // Signal that this Collect has failed. This ultimately results in an + // HTTP 500 being served for the /metrics response. + // + // While it would be most technically correct to do `NewInvalidMetric` + // for all of the Descs here, it just results in N identical error + // messages being printed in the /metrics response, so it seems a bit + // absurd to do that. + ch <- prometheus.NewInvalidMetric(memUsageDesc, fmt.Errorf("failed to retrieve task stats: %w", err)) return } c.logger.Debug("Got ECS task stats response", "stats", stats) diff --git a/ecscollector/collector_test.go b/ecscollector/collector_test.go index 57e54f5..04ec429 100644 --- a/ecscollector/collector_test.go +++ b/ecscollector/collector_test.go @@ -138,3 +138,24 @@ func TestEc2Metrics(t *testing.T) { collector := NewCollector(metadataClient, slog.Default()) assertSnapshot(t, collector, "testdata/snapshots/ec2_metrics.txt") } + +func TestApiErrors(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("GET /task", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(500) + w.Write([]byte("Internal Server Error")) + }) + mux.HandleFunc("GET /task/stats", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(500) + w.Write([]byte("Internal Server Error")) + }) + metadataServer := httptest.NewServer(mux) + defer metadataServer.Close() + metadataClient := ecsmetadata.NewClient(metadataServer.URL) + collector := NewCollector(metadataClient, slog.Default()) + + _, err := renderMetrics(collector) + if err == nil || err.Error() != "non-200 metrics response: 500" { + t.Fatalf("expected 500 error but got err: %v", err) + } +}