From 377c44f6dbcb241dfd105ffc6eb134045a4c1a3e Mon Sep 17 00:00:00 2001 From: Carlos Panato Date: Thu, 4 Mar 2021 14:08:51 +0100 Subject: [PATCH 1/2] aws-janitor: add job duration metric --- cmd/aws-janitor/main.go | 85 +++++++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 12 deletions(-) diff --git a/cmd/aws-janitor/main.go b/cmd/aws-janitor/main.go index e46f4b34..0a4c7df3 100644 --- a/cmd/aws-janitor/main.go +++ b/cmd/aws-janitor/main.go @@ -19,13 +19,19 @@ package main import ( "flag" "fmt" + "os" + "runtime" "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/push" "github.com/sirupsen/logrus" + "k8s.io/test-infra/prow/logrusutil" + "sigs.k8s.io/boskos/aws-janitor/account" "sigs.k8s.io/boskos/aws-janitor/regions" "sigs.k8s.io/boskos/aws-janitor/resources" @@ -34,16 +40,23 @@ import ( ) var ( - maxTTL = flag.Duration("ttl", 24*time.Hour, "Maximum time before attempting to delete a resource. Set to 0s to nuke all non-default resources.") - region = flag.String("region", "", "The region to clean (otherwise defaults to all regions)") - path = flag.String("path", "", "S3 path for mark data (required when -all=false)") - cleanAll = flag.Bool("all", false, "Clean all resources (ignores -path)") - logLevel = flag.String("log-level", "info", fmt.Sprintf("Log level is one of %v.", logrus.AllLevels)) - dryRun = flag.Bool("dry-run", false, "If set, don't delete any resources, only log what would be done") - ttlTagKey = flag.String("ttl-tag-key", "", "If set, allow resources to use a tag with this key to override TTL") + maxTTL = flag.Duration("ttl", 24*time.Hour, "Maximum time before attempting to delete a resource. Set to 0s to nuke all non-default resources.") + region = flag.String("region", "", "The region to clean (otherwise defaults to all regions)") + path = flag.String("path", "", "S3 path for mark data (required when -all=false)") + cleanAll = flag.Bool("all", false, "Clean all resources (ignores -path)") + logLevel = flag.String("log-level", "info", fmt.Sprintf("Log level is one of %v.", logrus.AllLevels)) + dryRun = flag.Bool("dry-run", false, "If set, don't delete any resources, only log what would be done") + ttlTagKey = flag.String("ttl-tag-key", "", "If set, allow resources to use a tag with this key to override TTL") + pushGateway = flag.String("push-gateway", "", "If specified, push prometheus metrics to this endpoint.") excludeTags common.CommaSeparatedStrings includeTags common.CommaSeparatedStrings + + cleaningTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "aws_janitor_job_duration_time_seconds", + ConstLabels: prometheus.Labels{}, + Buckets: prometheus.ExponentialBuckets(1, 1.4, 30), + }, []string{"type", "status"}) ) func init() { @@ -63,6 +76,26 @@ func main() { } logrus.SetLevel(level) + // If Prometheus PushGateway is configured, then before exit push the metric + // to the PushGateway instance, otherwise just exit + exitCode := 2 + startProcess := time.Now() + if *pushGateway != "" { + jobName := fmt.Sprintf("aws-janitor-%s", common.GenerateDynamicResourceName()) + registry := prometheus.NewRegistry() + registry.MustRegister(cleaningTimeHistogram) + pusher := push.New(*pushGateway, jobName).Gatherer(registry) + + defer func() { + pushMetricBeforeExit(pusher, startProcess, *cleanAll, exitCode) + os.Exit(exitCode) + }() + } else { + defer func() { + os.Exit(exitCode) + }() + } + // Retry aggressively (with default back-off). If the account is // in a really bad state, we may be contending with API rate // limiting and fighting against the very resources we're trying @@ -70,17 +103,20 @@ func main() { sess := session.Must(session.NewSessionWithOptions(session.Options{Config: aws.Config{MaxRetries: aws.Int(100)}})) acct, err := account.GetAccount(sess, regions.Default) if err != nil { - logrus.Fatalf("Failed retrieving account: %v", err) + logrus.Errorf("Failed retrieving account: %v", err) + runtime.Goexit() } logrus.Debugf("account: %s", acct) excludeTM, err := resources.TagMatcherForTags(excludeTags) if err != nil { - logrus.Fatalf("Error parsing --exclude-tags: %v", err) + logrus.Errorf("Error parsing --exclude-tags: %v", err) + runtime.Goexit() } includeTM, err := resources.TagMatcherForTags(includeTags) if err != nil { - logrus.Fatalf("Error parsing --include-tags: %v", err) + logrus.Errorf("Error parsing --include-tags: %v", err) + runtime.Goexit() } opts := resources.Options{ @@ -94,11 +130,15 @@ func main() { if *cleanAll { if err := resources.CleanAll(opts, *region); err != nil { - logrus.Fatalf("Error cleaning all resources: %v", err) + logrus.Errorf("Error cleaning all resources: %v", err) + runtime.Goexit() } } else if err := markAndSweep(opts, *region); err != nil { - logrus.Fatalf("Error marking and sweeping resources: %v", err) + logrus.Errorf("Error marking and sweeping resources: %v", err) + runtime.Goexit() } + + exitCode = 0 } func markAndSweep(opts resources.Options, region string) error { @@ -143,3 +183,24 @@ func markAndSweep(opts resources.Options, region string) error { return nil } + +func pushMetricBeforeExit(pusher *push.Pusher, startTime time.Time, jobType bool, exitCode int) { + // Set the status of the job + status := "failed" + if exitCode == 0 { + status = "success" + } + + // Set the type of the job to report the metric + job := "sweep" + if jobType { + job = "clean" + } + + duration := time.Since(startTime).Seconds() + cleaningTimeHistogram.WithLabelValues(job, status).Observe(duration) + + if err := pusher.Add(); err != nil { + logrus.Errorf("Could not push to Pushgateway: %v", err) + } +} From 7553aa1979bca250c26012bb25086e7b4ea2e8f2 Mon Sep 17 00:00:00 2001 From: Carlos Panato Date: Sun, 7 Mar 2021 19:04:43 +0100 Subject: [PATCH 2/2] add sweep count metric --- cmd/aws-janitor/main.go | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/cmd/aws-janitor/main.go b/cmd/aws-janitor/main.go index 0a4c7df3..24d45c72 100644 --- a/cmd/aws-janitor/main.go +++ b/cmd/aws-janitor/main.go @@ -52,11 +52,18 @@ var ( excludeTags common.CommaSeparatedStrings includeTags common.CommaSeparatedStrings + sweepCount int + cleaningTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "aws_janitor_job_duration_time_seconds", ConstLabels: prometheus.Labels{}, Buckets: prometheus.ExponentialBuckets(1, 1.4, 30), - }, []string{"type", "status"}) + }, []string{"type", "status", "region"}) + + sweepCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "aws_janitor_swept_resources", + ConstLabels: prometheus.Labels{}, + }, []string{"type", "status", "region"}) ) func init() { @@ -81,13 +88,12 @@ func main() { exitCode := 2 startProcess := time.Now() if *pushGateway != "" { - jobName := fmt.Sprintf("aws-janitor-%s", common.GenerateDynamicResourceName()) registry := prometheus.NewRegistry() - registry.MustRegister(cleaningTimeHistogram) - pusher := push.New(*pushGateway, jobName).Gatherer(registry) + registry.MustRegister(cleaningTimeHistogram, sweepCounter) + pusher := push.New(*pushGateway, "aws-janitor").Gatherer(registry) defer func() { - pushMetricBeforeExit(pusher, startProcess, *cleanAll, exitCode) + pushMetricBeforeExit(pusher, startProcess, exitCode) os.Exit(exitCode) }() } else { @@ -174,17 +180,17 @@ func markAndSweep(opts resources.Options, region string) error { } } - swept := res.MarkComplete() + sweepCount = res.MarkComplete() if err := res.Save(opts.Session, s3p); err != nil { return errors.Wrapf(err, "Error saving %q", *path) } - logrus.Infof("swept %d resources", swept) + logrus.Infof("swept %d resources", sweepCount) return nil } -func pushMetricBeforeExit(pusher *push.Pusher, startTime time.Time, jobType bool, exitCode int) { +func pushMetricBeforeExit(pusher *push.Pusher, startTime time.Time, exitCode int) { // Set the status of the job status := "failed" if exitCode == 0 { @@ -192,13 +198,21 @@ func pushMetricBeforeExit(pusher *push.Pusher, startTime time.Time, jobType bool } // Set the type of the job to report the metric - job := "sweep" - if jobType { - job = "clean" + var job string + if !*cleanAll { + job = "mark_and_sweep" + + sweepCounter. + With(prometheus.Labels{"type": job, "status": status, "region": *region}). + Add(float64(sweepCount)) + } else { + job = "clean_all" } duration := time.Since(startTime).Seconds() - cleaningTimeHistogram.WithLabelValues(job, status).Observe(duration) + cleaningTimeHistogram. + With(prometheus.Labels{"type": job, "status": status, "region": *region}). + Observe(duration) if err := pusher.Add(); err != nil { logrus.Errorf("Could not push to Pushgateway: %v", err)