Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

aws-janitor: add job duration metric #78

Merged
merged 2 commits into from
Mar 12, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 89 additions & 14 deletions cmd/aws-janitor/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,19 @@ package main
import (
"flag"
"fmt"
"os"
"runtime"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/push"
"github.com/sirupsen/logrus"

"k8s.io/test-infra/prow/logrusutil"

"sigs.k8s.io/boskos/aws-janitor/account"
"sigs.k8s.io/boskos/aws-janitor/regions"
"sigs.k8s.io/boskos/aws-janitor/resources"
Expand All @@ -34,16 +40,30 @@ import (
)

var (
maxTTL = flag.Duration("ttl", 24*time.Hour, "Maximum time before attempting to delete a resource. Set to 0s to nuke all non-default resources.")
region = flag.String("region", "", "The region to clean (otherwise defaults to all regions)")
path = flag.String("path", "", "S3 path for mark data (required when -all=false)")
cleanAll = flag.Bool("all", false, "Clean all resources (ignores -path)")
logLevel = flag.String("log-level", "info", fmt.Sprintf("Log level is one of %v.", logrus.AllLevels))
dryRun = flag.Bool("dry-run", false, "If set, don't delete any resources, only log what would be done")
ttlTagKey = flag.String("ttl-tag-key", "", "If set, allow resources to use a tag with this key to override TTL")
maxTTL = flag.Duration("ttl", 24*time.Hour, "Maximum time before attempting to delete a resource. Set to 0s to nuke all non-default resources.")
region = flag.String("region", "", "The region to clean (otherwise defaults to all regions)")
path = flag.String("path", "", "S3 path for mark data (required when -all=false)")
cleanAll = flag.Bool("all", false, "Clean all resources (ignores -path)")
logLevel = flag.String("log-level", "info", fmt.Sprintf("Log level is one of %v.", logrus.AllLevels))
dryRun = flag.Bool("dry-run", false, "If set, don't delete any resources, only log what would be done")
ttlTagKey = flag.String("ttl-tag-key", "", "If set, allow resources to use a tag with this key to override TTL")
pushGateway = flag.String("push-gateway", "", "If specified, push prometheus metrics to this endpoint.")

excludeTags common.CommaSeparatedStrings
includeTags common.CommaSeparatedStrings

sweepCount int

cleaningTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "aws_janitor_job_duration_time_seconds",
ConstLabels: prometheus.Labels{},
Buckets: prometheus.ExponentialBuckets(1, 1.4, 30),
}, []string{"type", "status", "region"})

sweepCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "aws_janitor_swept_resources",
ConstLabels: prometheus.Labels{},
}, []string{"type", "status", "region"})
)

func init() {
Expand All @@ -63,24 +83,46 @@ func main() {
}
logrus.SetLevel(level)

// If Prometheus PushGateway is configured, then before exit push the metric
// to the PushGateway instance, otherwise just exit
exitCode := 2
startProcess := time.Now()
if *pushGateway != "" {
registry := prometheus.NewRegistry()
registry.MustRegister(cleaningTimeHistogram, sweepCounter)
pusher := push.New(*pushGateway, "aws-janitor").Gatherer(registry)

defer func() {
pushMetricBeforeExit(pusher, startProcess, exitCode)
os.Exit(exitCode)
}()
} else {
defer func() {
os.Exit(exitCode)
}()
}

// Retry aggressively (with default back-off). If the account is
// in a really bad state, we may be contending with API rate
// limiting and fighting against the very resources we're trying
// to delete.
sess := session.Must(session.NewSessionWithOptions(session.Options{Config: aws.Config{MaxRetries: aws.Int(100)}}))
acct, err := account.GetAccount(sess, regions.Default)
if err != nil {
logrus.Fatalf("Failed retrieving account: %v", err)
logrus.Errorf("Failed retrieving account: %v", err)
runtime.Goexit()
}
logrus.Debugf("account: %s", acct)

excludeTM, err := resources.TagMatcherForTags(excludeTags)
if err != nil {
logrus.Fatalf("Error parsing --exclude-tags: %v", err)
logrus.Errorf("Error parsing --exclude-tags: %v", err)
runtime.Goexit()
}
includeTM, err := resources.TagMatcherForTags(includeTags)
if err != nil {
logrus.Fatalf("Error parsing --include-tags: %v", err)
logrus.Errorf("Error parsing --include-tags: %v", err)
runtime.Goexit()
}

opts := resources.Options{
Expand All @@ -94,11 +136,15 @@ func main() {

if *cleanAll {
if err := resources.CleanAll(opts, *region); err != nil {
logrus.Fatalf("Error cleaning all resources: %v", err)
logrus.Errorf("Error cleaning all resources: %v", err)
runtime.Goexit()
}
} else if err := markAndSweep(opts, *region); err != nil {
logrus.Fatalf("Error marking and sweeping resources: %v", err)
logrus.Errorf("Error marking and sweeping resources: %v", err)
runtime.Goexit()
}

exitCode = 0
}

func markAndSweep(opts resources.Options, region string) error {
Expand Down Expand Up @@ -134,12 +180,41 @@ func markAndSweep(opts resources.Options, region string) error {
}
}

swept := res.MarkComplete()
sweepCount = res.MarkComplete()
if err := res.Save(opts.Session, s3p); err != nil {
return errors.Wrapf(err, "Error saving %q", *path)
}

logrus.Infof("swept %d resources", swept)
logrus.Infof("swept %d resources", sweepCount)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no action required in this change, but in the future it might be worth reporting how many resources were deleted, too (see line directly above this one, I just can't comment on it directly)

(there's probably a similar count obtainable from CleanAll())

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your review and feedback, Jeff.

Added a metric to report the sweep count. The CleanAll() function does not return how many resources were deleted, but I think we can add that; I can do it if you think that would be useful.

ptal

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, we'd need to pass that information back from CleanAll(), but that can happen separately.

return nil
}

// pushMetricBeforeExit records the janitor job's duration (and, for
// mark-and-sweep runs, the number of swept resources) and pushes all
// collected metrics to the configured Prometheus Pushgateway.
// startTime is when the process started; exitCode selects the status label.
func pushMetricBeforeExit(pusher *push.Pusher, startTime time.Time, exitCode int) {
	// Derive the job status label from the process exit code.
	status := "failed"
	if exitCode == 0 {
		status = "success"
	}

	// Pick the job type label; a mark-and-sweep run additionally
	// reports how many resources were swept.
	var job string
	if *cleanAll {
		job = "clean_all"
	} else {
		job = "mark_and_sweep"

		labels := prometheus.Labels{"type": job, "status": status, "region": *region}
		sweepCounter.With(labels).Add(float64(sweepCount))
	}

	// Observe the total wall-clock time of this run.
	elapsed := time.Since(startTime).Seconds()
	cleaningTimeHistogram.
		With(prometheus.Labels{"type": job, "status": status, "region": *region}).
		Observe(elapsed)

	// Add (HTTP POST) accumulates into the push group rather than
	// replacing it; a push failure is logged but does not change the
	// exit code.
	if err := pusher.Add(); err != nil {
		logrus.Errorf("Could not push to Pushgateway: %v", err)
	}
}