Skip to content

Commit

Permalink
Merge pull request #1 from gjtempleton/Monitor-Rebalance-Recommendations
Browse files Browse the repository at this point in the history
Spot termination exporter - fetch rebalance recommendations
  • Loading branch information
gjtempleton authored Sep 27, 2021
2 parents a438b7c + ae83b3a commit df881ba
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 22 deletions.
25 changes: 16 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,26 @@ Prometheus [exporters](https://prometheus.io/docs/instrumenting/writing_exporter

### Spot instance termination notice

The Termination Notice is accessible to code running on the instance via the instance’s metadata at `http://169.254.169.254/latest/meta-data/spot/termination-time`. This field becomes available when the instance has been marked for termination and will contain the time when a shutdown signal will be sent to the instance’s operating system.
At that time, the Spot Instance Request’s bid status will be set to `marked-for-termination.`
The Termination Notice is accessible to code running on the instance via the instance’s metadata at `http://169.254.169.254/latest/meta-data/spot/termination-time`. This field becomes available when the instance has been marked for termination and will contain the time when a shutdown signal will be sent to the instance’s operating system.
At that time, the Spot Instance Request’s bid status will be set to `marked-for-termination`.
The bid status is accessible via the `DescribeSpotInstanceRequests` API for use by programs that manage Spot bids and instances.

### Spot instance rebalance recommendations

[Rebalance recommendations](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/rebalance-recommendations.html) are advance notice that a given spot instance is at elevated risk of spot disruption. They can be accessed either via AWS EventBridge or via the instance metadata endpoint. A number of AWS tools automatically handle rebalance recommendations, for instance [EKS managed node groups](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html#managed-node-group-capacity-types).

### Quick start

The project uses the [promu](https://github.com/prometheus/promu) Prometheus utility tool. To build the exporter `promu` needs to be installed. To install promu and build the exporter:

```
```bash
go get github.com/prometheus/promu
promu build
```

The following options can be configured when starting the exporter:

```
```bash
./spot-termination-exporter --help
Usage of ./spot-termination-exporter:
-bind-addr string
Expand All @@ -42,19 +46,22 @@ Usage of ./spot-termintation-exporter:

### Test locally

The AWS instance metadata is available at `http://169.254.169.254/latest/meta-data/`. By default this is the endpoint that is being queried by the exporter but it is quite hard to reproduce a termination notice on an AWS instance for testing, so the meta-data endpoint can be changed in the configuration.
There is a test server in the `utils` directory that can be used to mock the behavior of the metadata endpoint. It listens on port 9092 and provides dummy responses for `/instance-id` and `/spot/instance-action`. It can be started with:
```
The AWS instance metadata is available at `http://169.254.169.254/latest/meta-data/`. By default this is the endpoint that is being queried by the exporter but it is quite hard to reproduce a termination notice or rebalance recommendation on an AWS instance for testing, so the meta-data endpoint can be changed in the configuration.
There is a test server in the `util` directory that can be used to mock the behavior of the metadata endpoint. It listens on port 9092 and provides dummy responses for `/instance-id`, `/instance-type`, `/spot/instance-action`, and `/events/recommendations/rebalance`. It can be started with:

```bash
go run util/test_server.go
```

The exporter can be started with this configuration to query this endpoint locally:
```

```bash
./spot-termination-exporter --metadata-endpoint http://localhost:9092/latest/meta-data/ --log-level debug
```

### Metrics

```
```text
# HELP aws_instance_metadata_service_available Metadata service available
# TYPE aws_instance_metadata_service_available gauge
aws_instance_metadata_service_available{instance_id="i-0d2aab13057917887"} 1
Expand Down
77 changes: 65 additions & 12 deletions metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,37 @@ import (
)

type terminationCollector struct {
metadataEndpoint string
scrapeSuccessful *prometheus.Desc
terminationIndicator *prometheus.Desc
terminationTime *prometheus.Desc
metadataEndpoint string
rebalanceIndicator *prometheus.Desc
rebalanceScrapeSuccessful *prometheus.Desc
scrapeSuccessful *prometheus.Desc
terminationIndicator *prometheus.Desc
terminationTime *prometheus.Desc
}

// InstanceAction holds the fields decoded from a JSON document with "action"
// and "time" keys.
// NOTE(review): presumably this is the body returned by the metadata
// service's spot/instance-action endpoint — confirm against Collect.
type InstanceAction struct {
// Action is read from the "action" JSON key.
Action string `json:"action"`
// Time is read from the "time" JSON key and parsed as a timestamp.
Time time.Time `json:"time"`
}

// InstanceEvent holds the fields decoded from the JSON body returned by the
// metadata service's events/recommendations/rebalance endpoint.
type InstanceEvent struct {
// NoticeTime is read from the "noticeTime" JSON key; Collect only logs it.
NoticeTime time.Time `json:"noticeTime"`
}

func NewTerminationCollector(me string) *terminationCollector {
return &terminationCollector{
metadataEndpoint: me,
scrapeSuccessful: prometheus.NewDesc("aws_instance_metadata_service_available", "Metadata service available", []string{"instance_id"}, nil),
terminationIndicator: prometheus.NewDesc("aws_instance_termination_imminent", "Instance is about to be terminated", []string{"instance_action", "instance_id"}, nil),
terminationTime: prometheus.NewDesc("aws_instance_termination_in", "Instance will be terminated in", []string{"instance_id"}, nil),
metadataEndpoint: me,
rebalanceIndicator: prometheus.NewDesc("aws_instance_rebalance_recommended", "Instance rebalance is recommended", []string{"instance_id", "instance_type"}, nil),
rebalanceScrapeSuccessful: prometheus.NewDesc("aws_instance_metadata_service_events_available", "Metadata service events endpoint available", []string{"instance_id"}, nil),
scrapeSuccessful: prometheus.NewDesc("aws_instance_metadata_service_available", "Metadata service available", []string{"instance_id"}, nil),
terminationIndicator: prometheus.NewDesc("aws_instance_termination_imminent", "Instance is about to be terminated", []string{"instance_action", "instance_id", "instance_type"}, nil),
terminationTime: prometheus.NewDesc("aws_instance_termination_in", "Instance will be terminated in", []string{"instance_id", "instance_type"}, nil),
}
}

func (c *terminationCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.rebalanceIndicator
ch <- c.rebalanceScrapeSuccessful
ch <- c.scrapeSuccessful
ch <- c.terminationIndicator
ch <- c.terminationTime
Expand All @@ -57,6 +67,20 @@ func (c *terminationCollector) Collect(ch chan<- prometheus.Metric) {
body, _ := ioutil.ReadAll(idResp.Body)
instanceId = string(body)

typeResp, err := client.Get(c.metadataEndpoint + "instance-type")
var instanceType string
if err != nil {
log.Errorf("couldn't parse instance-type from metadata: %s", err.Error())
return
}
if typeResp.StatusCode == 404 {
log.Errorf("couldn't parse instance-type from metadata: endpoint not found")
return
}
defer typeResp.Body.Close()
body, _ = ioutil.ReadAll(typeResp.Body)
instanceType = string(body)

resp, err := client.Get(c.metadataEndpoint + "spot/instance-action")
if err != nil {
log.Errorf("Failed to fetch data from metadata service: %s", err)
Expand All @@ -67,7 +91,7 @@ func (c *terminationCollector) Collect(ch chan<- prometheus.Metric) {

if resp.StatusCode == 404 {
log.Debug("instance-action endpoint not found")
ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 0, "", instanceId)
ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 0, "", instanceId, instanceType)
return
} else {
defer resp.Body.Close()
Expand All @@ -80,15 +104,44 @@ func (c *terminationCollector) Collect(ch chan<- prometheus.Metric) {
// so parse error is not fatal
if err != nil {
log.Errorf("Couldn't parse instance-action metadata: %s", err)
ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 0, instanceId)
ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 0, instanceId, instanceType)
} else {
log.Infof("instance-action endpoint available, termination time: %v", ia.Time)
ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 1, ia.Action, instanceId)
ch <- prometheus.MustNewConstMetric(c.terminationIndicator, prometheus.GaugeValue, 1, ia.Action, instanceId, instanceType)
delta := ia.Time.Sub(time.Now())
if delta.Seconds() > 0 {
ch <- prometheus.MustNewConstMetric(c.terminationTime, prometheus.GaugeValue, delta.Seconds(), instanceId)
ch <- prometheus.MustNewConstMetric(c.terminationTime, prometheus.GaugeValue, delta.Seconds(), instanceId, instanceType)
}
}
}
}

eventResp, err := client.Get(c.metadataEndpoint + "events/recommendations/rebalance")
if err != nil {
log.Errorf("Failed to fetch events data from metadata service: %s", err)
ch <- prometheus.MustNewConstMetric(c.rebalanceScrapeSuccessful, prometheus.GaugeValue, 0, instanceId)
return
} else {
ch <- prometheus.MustNewConstMetric(c.rebalanceScrapeSuccessful, prometheus.GaugeValue, 1, instanceId)

if eventResp.StatusCode == 404 {
log.Debug("rebalance endpoint not found")
ch <- prometheus.MustNewConstMetric(c.rebalanceIndicator, prometheus.GaugeValue, 0, instanceId, instanceType)
return
} else {
defer eventResp.Body.Close()
body, _ := ioutil.ReadAll(eventResp.Body)

var ie = InstanceEvent{}
err := json.Unmarshal(body, &ie)

if err != nil {
log.Errorf("Couldn't parse rebalance recommendation event metadata: %s", err)
ch <- prometheus.MustNewConstMetric(c.rebalanceIndicator, prometheus.GaugeValue, 0, instanceId, instanceType)
} else {
log.Infof("rebalance recommendation event endpoint available, recommendation time: %v", ie.NoticeTime)
ch <- prometheus.MustNewConstMetric(c.rebalanceIndicator, prometheus.GaugeValue, 1, instanceId, instanceType)
}
}
}
}
10 changes: 9 additions & 1 deletion util/test_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
)

// use this minimal http server to test the exporter locally
// change constant to have metadataEndpoint = "http://localhost:9092/latest/meta-data/spot/instance-action"
// Run the exporter with the flag: --metadata-endpoint http://localhost:9092/latest/meta-data/
func main() {
http.HandleFunc("/latest/meta-data/spot/instance-action", func(w http.ResponseWriter, r *http.Request) {
terminationTime := time.Now().Add(2 * time.Minute)
Expand All @@ -19,6 +19,14 @@ func main() {
http.HandleFunc("/latest/meta-data/instance-id", func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "i-0d2aab13057917887")
})
http.HandleFunc("/latest/meta-data/instance-type", func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "c5.9xlarge")
})
http.HandleFunc("/latest/meta-data/events/recommendations/rebalance", func(w http.ResponseWriter, r *http.Request) {
noticeTime := time.Now()
utc, _ := time.LoadLocation("UTC")
fmt.Fprintf(w, "{\"noticeTime\":\"%s\"}", noticeTime.In(utc).Format(time.RFC3339))
})

log.Fatal(http.ListenAndServe(":9092", nil))

Expand Down

0 comments on commit df881ba

Please sign in to comment.