Skip to content

Commit

Permalink
Merge pull request #78 from cpanato/GH-13-aws-janitor
Browse files Browse the repository at this point in the history
aws-janitor: add job duration metric
  • Loading branch information
k8s-ci-robot authored Mar 12, 2021
2 parents 32671f8 + 7553aa1 commit 885e35e
Showing 1 changed file with 89 additions and 14 deletions.
103 changes: 89 additions & 14 deletions cmd/aws-janitor/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,19 @@ package main
import (
"flag"
"fmt"
"os"
"runtime"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/push"
"github.com/sirupsen/logrus"

"k8s.io/test-infra/prow/logrusutil"

"sigs.k8s.io/boskos/aws-janitor/account"
"sigs.k8s.io/boskos/aws-janitor/regions"
"sigs.k8s.io/boskos/aws-janitor/resources"
Expand All @@ -34,16 +40,30 @@ import (
)

var (
maxTTL = flag.Duration("ttl", 24*time.Hour, "Maximum time before attempting to delete a resource. Set to 0s to nuke all non-default resources.")
region = flag.String("region", "", "The region to clean (otherwise defaults to all regions)")
path = flag.String("path", "", "S3 path for mark data (required when -all=false)")
cleanAll = flag.Bool("all", false, "Clean all resources (ignores -path)")
logLevel = flag.String("log-level", "info", fmt.Sprintf("Log level is one of %v.", logrus.AllLevels))
dryRun = flag.Bool("dry-run", false, "If set, don't delete any resources, only log what would be done")
ttlTagKey = flag.String("ttl-tag-key", "", "If set, allow resources to use a tag with this key to override TTL")
maxTTL = flag.Duration("ttl", 24*time.Hour, "Maximum time before attempting to delete a resource. Set to 0s to nuke all non-default resources.")
region = flag.String("region", "", "The region to clean (otherwise defaults to all regions)")
path = flag.String("path", "", "S3 path for mark data (required when -all=false)")
cleanAll = flag.Bool("all", false, "Clean all resources (ignores -path)")
logLevel = flag.String("log-level", "info", fmt.Sprintf("Log level is one of %v.", logrus.AllLevels))
dryRun = flag.Bool("dry-run", false, "If set, don't delete any resources, only log what would be done")
ttlTagKey = flag.String("ttl-tag-key", "", "If set, allow resources to use a tag with this key to override TTL")
pushGateway = flag.String("push-gateway", "", "If specified, push prometheus metrics to this endpoint.")

excludeTags common.CommaSeparatedStrings
includeTags common.CommaSeparatedStrings

sweepCount int

cleaningTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "aws_janitor_job_duration_time_seconds",
ConstLabels: prometheus.Labels{},
Buckets: prometheus.ExponentialBuckets(1, 1.4, 30),
}, []string{"type", "status", "region"})

sweepCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "aws_janitor_swept_resources",
ConstLabels: prometheus.Labels{},
}, []string{"type", "status", "region"})
)

func init() {
Expand All @@ -63,24 +83,46 @@ func main() {
}
logrus.SetLevel(level)

// If Prometheus PushGateway is configured, then before exit push the metric
// to the PushGateway instance, otherwise just exit
exitCode := 2
startProcess := time.Now()
if *pushGateway != "" {
registry := prometheus.NewRegistry()
registry.MustRegister(cleaningTimeHistogram, sweepCounter)
pusher := push.New(*pushGateway, "aws-janitor").Gatherer(registry)

defer func() {
pushMetricBeforeExit(pusher, startProcess, exitCode)
os.Exit(exitCode)
}()
} else {
defer func() {
os.Exit(exitCode)
}()
}

// Retry aggressively (with default back-off). If the account is
// in a really bad state, we may be contending with API rate
// limiting and fighting against the very resources we're trying
// to delete.
sess := session.Must(session.NewSessionWithOptions(session.Options{Config: aws.Config{MaxRetries: aws.Int(100)}}))
acct, err := account.GetAccount(sess, regions.Default)
if err != nil {
logrus.Fatalf("Failed retrieving account: %v", err)
logrus.Errorf("Failed retrieving account: %v", err)
runtime.Goexit()
}
logrus.Debugf("account: %s", acct)

excludeTM, err := resources.TagMatcherForTags(excludeTags)
if err != nil {
logrus.Fatalf("Error parsing --exclude-tags: %v", err)
logrus.Errorf("Error parsing --exclude-tags: %v", err)
runtime.Goexit()
}
includeTM, err := resources.TagMatcherForTags(includeTags)
if err != nil {
logrus.Fatalf("Error parsing --include-tags: %v", err)
logrus.Errorf("Error parsing --include-tags: %v", err)
runtime.Goexit()
}

opts := resources.Options{
Expand All @@ -94,11 +136,15 @@ func main() {

if *cleanAll {
if err := resources.CleanAll(opts, *region); err != nil {
logrus.Fatalf("Error cleaning all resources: %v", err)
logrus.Errorf("Error cleaning all resources: %v", err)
runtime.Goexit()
}
} else if err := markAndSweep(opts, *region); err != nil {
logrus.Fatalf("Error marking and sweeping resources: %v", err)
logrus.Errorf("Error marking and sweeping resources: %v", err)
runtime.Goexit()
}

exitCode = 0
}

func markAndSweep(opts resources.Options, region string) error {
Expand Down Expand Up @@ -134,12 +180,41 @@ func markAndSweep(opts resources.Options, region string) error {
}
}

swept := res.MarkComplete()
sweepCount = res.MarkComplete()
if err := res.Save(opts.Session, s3p); err != nil {
return errors.Wrapf(err, "Error saving %q", *path)
}

logrus.Infof("swept %d resources", swept)
logrus.Infof("swept %d resources", sweepCount)

return nil
}

func pushMetricBeforeExit(pusher *push.Pusher, startTime time.Time, exitCode int) {
// Set the status of the job
status := "failed"
if exitCode == 0 {
status = "success"
}

// Set the type of the job to report the metric
var job string
if !*cleanAll {
job = "mark_and_sweep"

sweepCounter.
With(prometheus.Labels{"type": job, "status": status, "region": *region}).
Add(float64(sweepCount))
} else {
job = "clean_all"
}

duration := time.Since(startTime).Seconds()
cleaningTimeHistogram.
With(prometheus.Labels{"type": job, "status": status, "region": *region}).
Observe(duration)

if err := pusher.Add(); err != nil {
logrus.Errorf("Could not push to Pushgateway: %v", err)
}
}

0 comments on commit 885e35e

Please sign in to comment.