From ae83b3aa63281af1d6642edef645d52e0e538809 Mon Sep 17 00:00:00 2001 From: GuyTempleton Date: Thu, 23 Sep 2021 13:15:49 +0100 Subject: [PATCH] Spot termination exporter - fetch rebalance recommendations --- README.md | 25 +++++++++------ metadata.go | 77 ++++++++++++++++++++++++++++++++++++++------- util/test_server.go | 10 +++++- 3 files changed, 90 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index dcb4948..2202de6 100644 --- a/README.md +++ b/README.md @@ -11,22 +11,26 @@ Prometheus [exporters](https://prometheus.io/docs/instrumenting/writing_exporter ### Spot instance termination notice -The Termination Notice is accessible to code running on the instance via the instance’s metadata at `http://169.254.169.254/latest/meta-data/spot/termination-time`. This field becomes available when the instance has been marked for termination and will contain the time when a shutdown signal will be sent to the instance’s operating system. -At that time, the Spot Instance Request’s bid status will be set to `marked-for-termination.` +The Termination Notice is accessible to code running on the instance via the instance’s metadata at `http://169.254.169.254/latest/meta-data/spot/termination-time`. This field becomes available when the instance has been marked for termination and will contain the time when a shutdown signal will be sent to the instance’s operating system. +At that time, the Spot Instance Request’s bid status will be set to `marked-for-termination.` The bid status is accessible via the `DescribeSpotInstanceRequests` API for use by programs that manage Spot bids and instances. +### Spot instance rebalance recommendations + +[Rebalance recommendations](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/rebalance-recommendations.html) are advance notice that a given spot instance is at elevated risk of spot disruption, they can either be accessed via AWS EventBridge or via the instance metadata endpoint. 
A number of AWS tools automatically handle rebalance recommendations, for instance [EKS managed node groups](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html#managed-node-group-capacity-types). + ### Quick start The project uses the [promu](https://github.com/prometheus/promu) Prometheus utility tool. To build the exporter `promu` needs to be installed. To install promu and build the exporter: -``` +```bash go get github.com/prometheus/promu promu build ``` The following options can be configured when starting the exporter: -``` +```bash ./spot-termination-exporter --help Usage of ./spot-termintation-exporter: -bind-addr string @@ -42,19 +46,22 @@ Usage of ./spot-termintation-exporter: ### Test locally -The AWS instance metadata is available at `http://169.254.169.254/latest/meta-data/`. By default this is the endpoint that is being queried by the exporter but it is quite hard to reproduce a termination notice on an AWS instance for testing, so the meta-data endpoint can be changed in the configuration. -There is a test server in the `utils` directory that can be used to mock the behavior of the metadata endpoint. It listens on port 9092 and provides dummy responses for `/instance-id` and `/spot/instance-action`. It can be started with: -``` +The AWS instance metadata is available at `http://169.254.169.254/latest/meta-data/`. By default this is the endpoint that is being queried by the exporter but it is quite hard to reproduce a termination notice or rebalance recommendation on an AWS instance for testing, so the meta-data endpoint can be changed in the configuration. +There is a test server in the `util` directory that can be used to mock the behavior of the metadata endpoint. It listens on port 9092 and provides dummy responses for `/instance-id`, `/spot/instance-action`, `/instance-type`, and `/events/recommendations/rebalance`.
It can be started with: + +```bash go run util/test_server.go ``` + The exporter can be started with this configuration to query this endpoint locally: -``` + +```bash ./spot-termination-exporter --metadata-endpoint http://localhost:9092/latest/meta-data/ --log-level debug ``` ### Metrics -``` +```text # HELP aws_instance_metadata_service_available Metadata service available # TYPE aws_instance_metadata_service_available gauge aws_instance_metadata_service_available{instance_id="i-0d2aab13057917887"} 1 diff --git a/metadata.go b/metadata.go index 0c05870..4fd9777 100644 --- a/metadata.go +++ b/metadata.go @@ -10,10 +10,12 @@ import ( ) type terminationCollector struct { - metadataEndpoint string - scrapeSuccessful *prometheus.Desc - terminationIndicator *prometheus.Desc - terminationTime *prometheus.Desc + metadataEndpoint string + rebalanceIndicator *prometheus.Desc + rebalanceScrapeSuccessful *prometheus.Desc + scrapeSuccessful *prometheus.Desc + terminationIndicator *prometheus.Desc + terminationTime *prometheus.Desc } type InstanceAction struct { @@ -21,16 +23,24 @@ type InstanceAction struct { Time time.Time `json:"time"` } +type InstanceEvent struct { + NoticeTime time.Time `json:"noticeTime"` +} + func NewTerminationCollector(me string) *terminationCollector { return &terminationCollector{ - metadataEndpoint: me, - scrapeSuccessful: prometheus.NewDesc("aws_instance_metadata_service_available", "Metadata service available", []string{"instance_id"}, nil), - terminationIndicator: prometheus.NewDesc("aws_instance_termination_imminent", "Instance is about to be terminated", []string{"instance_action", "instance_id"}, nil), - terminationTime: prometheus.NewDesc("aws_instance_termination_in", "Instance will be terminated in", []string{"instance_id"}, nil), + metadataEndpoint: me, + rebalanceIndicator: prometheus.NewDesc("aws_instance_rebalance_recommended", "Instance rebalance is recommended", []string{"instance_id", "instance_type"}, nil), + 
rebalanceScrapeSuccessful: prometheus.NewDesc("aws_instance_metadata_service_events_available", "Metadata service events endpoint available", []string{"instance_id"}, nil), + scrapeSuccessful: prometheus.NewDesc("aws_instance_metadata_service_available", "Metadata service available", []string{"instance_id"}, nil), + terminationIndicator: prometheus.NewDesc("aws_instance_termination_imminent", "Instance is about to be terminated", []string{"instance_action", "instance_id", "instance_type"}, nil), + terminationTime: prometheus.NewDesc("aws_instance_termination_in", "Instance will be terminated in", []string{"instance_id", "instance_type"}, nil), } } func (c *terminationCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.rebalanceIndicator + ch <- c.rebalanceScrapeSuccessful ch <- c.scrapeSuccessful ch <- c.terminationIndicator ch <- c.terminationTime @@ -57,6 +67,20 @@ func (c *terminationCollector) Collect(ch chan<- prometheus.Metric) { body, _ := ioutil.ReadAll(idResp.Body) instanceId = string(body) + typeResp, err := client.Get(c.metadataEndpoint + "instance-type") + var instanceType string + if err != nil { + log.Errorf("couldn't parse instance-type from metadata: %s", err.Error()) + return + } + if typeResp.StatusCode == 404 { + log.Errorf("couldn't parse instance-type from metadata: endpoint not found") + return + } + defer typeResp.Body.Close() + body, _ = ioutil.ReadAll(typeResp.Body) + instanceType = string(body) + resp, err := client.Get(c.metadataEndpoint + "spot/instance-action") if err != nil { log.Errorf("Failed to fetch data from metadata service: %s", err) @@ -67,7 +91,7 @@ func (c *terminationCollector) Collect(ch chan<- prometheus.Metric) { if resp.StatusCode == 404 { log.Debug("instance-action endpoint not found") - ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 0, "", instanceId) + ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 0, "", instanceId, instanceType) return } 
else { defer resp.Body.Close() @@ -80,15 +104,44 @@ func (c *terminationCollector) Collect(ch chan<- prometheus.Metric) { // so parse error is not fatal if err != nil { log.Errorf("Couldn't parse instance-action metadata: %s", err) - ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 0, instanceId) + ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 0, instanceId, instanceType) } else { log.Infof("instance-action endpoint available, termination time: %v", ia.Time) - ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 1, ia.Action, instanceId) + ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 1, ia.Action, instanceId, instanceType) delta := ia.Time.Sub(time.Now()) if delta.Seconds() > 0 { - ch <- prometheus.MustNewConstMetric(c.terminationTime, prometheus.GaugeValue, delta.Seconds(), instanceId) + ch <- prometheus.MustNewConstMetric(c.terminationTime, prometheus.GaugeValue, delta.Seconds(), instanceId, instanceType) } } } } + + eventResp, err := client.Get(c.metadataEndpoint + "events/recommendations/rebalance") + if err != nil { + log.Errorf("Failed to fetch events data from metadata service: %s", err) + ch <- prometheus.MustNewConstMetric(c.rebalanceScrapeSuccessful, prometheus.GaugeValue, 0, instanceId) + return + } else { + ch <- prometheus.MustNewConstMetric(c.rebalanceScrapeSuccessful, prometheus.GaugeValue, 1, instanceId) + + if eventResp.StatusCode == 404 { + log.Debug("rebalance endpoint not found") + ch <- prometheus.MustNewConstMetric(c.rebalanceIndicator, prometheus.GaugeValue, 0, instanceId, instanceType) + return + } else { + defer eventResp.Body.Close() + body, _ := ioutil.ReadAll(eventResp.Body) + + var ie = InstanceEvent{} + err := json.Unmarshal(body, &ie) + + if err != nil { + log.Errorf("Couldn't parse rebalance recommendation event metadata: %s", err) + ch <- prometheus.MustNewConstMetric(c.rebalanceIndicator, 
prometheus.GaugeValue, 0, instanceId, instanceType) + } else { + log.Infof("rebalance recommendation event endpoint available, recommendation time: %v", ie.NoticeTime) + ch <- prometheus.MustNewConstMetric(c.rebalanceIndicator, prometheus.GaugeValue, 1, instanceId, instanceType) + } + } + } } diff --git a/util/test_server.go b/util/test_server.go index 5395283..166a89c 100644 --- a/util/test_server.go +++ b/util/test_server.go @@ -8,7 +8,7 @@ import ( ) // use this minimal http server to test the exporter locally -// change constant to have metadataEndpoint = "http://localhost:9092/latest/meta-data/spot/instance-action" +// Run the exporter with the flag --metadata-endpoint = "http://localhost:9092/latest/meta-data/" func main() { http.HandleFunc("/latest/meta-data/spot/instance-action", func(w http.ResponseWriter, r *http.Request) { terminationTime := time.Now().Add(2 * time.Minute) @@ -19,6 +19,14 @@ func main() { http.HandleFunc("/latest/meta-data/instance-id", func(w http.ResponseWriter, r *http.Request) { fmt.Fprint(w, "i-0d2aab13057917887") }) + http.HandleFunc("/latest/meta-data/instance-type", func(w http.ResponseWriter, r *http.Request) { + fmt.Fprint(w, "c5.9xlarge") + }) + http.HandleFunc("/latest/meta-data/events/recommendations/rebalance", func(w http.ResponseWriter, r *http.Request) { + noticeTime := time.Now() + utc, _ := time.LoadLocation("UTC") + fmt.Fprintf(w, "{\"noticeTime\":\"%s\"}", noticeTime.In(utc).Format(time.RFC3339)) + }) log.Fatal(http.ListenAndServe(":9092", nil))