Merge pull request #4 from faros-ai/master

Added UnHealthyHostCount metric + cleanup warnings
lorenzoaiello · Feb 1, 2022 · 4920339 · 4920339
2 parents 3bdbbc0 + fd10f77
commit 4920339
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -12,10 +12,11 @@ Alarms Always Created:
 - Any 5xx errors from the target group
 - Any 5xx errors from the load balancer
 - Unacceptably high average response times
+- Unhealthy host count in the target group above the configured threshold
 
-**Estimated Operating Cost**: $ 0.30 / month
+**Estimated Operating Cost**: $ 0.40 / month
 
-- $ 0.10 / month for Metric Alarms (3x)
+- $ 0.10 / month per Metric Alarm (4x)
 
 ## Example
 
@@ -37,6 +38,7 @@ module "aws-alb-alarms" {
 | load\_balancer\_id | ALB ID | `string` | n/a | yes |
 | prefix | Alarm Name Prefix | `string` | `""` | no |
 | response\_time\_threshold | The average number of milliseconds that requests should complete within. | `string` | `"50"` | no |
+| unhealthy\_hosts\_threshold | The number of unhealthy hosts above which the alarm triggers. | `string` | `"0"` | no |
 | statistic\_period | The number of seconds that make each statistic period. | `string` | `"60"` | no |
 | target\_group\_id | Target Group ID | `string` | n/a | yes |
 

diff --git a/main.tf b/main.tf
@@ -44,7 +44,26 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time_average" {
   period              = var.statistic_period
   statistic           = "Average"
   threshold           = var.response_time_threshold
-  alarm_description   = "Average API response time is too high"
+  alarm_description   = format("Average API response time is greater than %s", var.response_time_threshold)
+  alarm_actions       = var.actions_alarm
+  ok_actions          = var.actions_ok
+
+  dimensions = {
+    "TargetGroup"  = var.target_group_id
+    "LoadBalancer" = var.load_balancer_id
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "unhealthy_hosts" {
+  alarm_name          = "${var.prefix}alb-tg-${var.target_group_id}-unhealthy-hosts"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = var.evaluation_period
+  metric_name         = "UnHealthyHostCount"
+  namespace           = "AWS/ApplicationELB"
+  period              = var.statistic_period
+  statistic           = "Minimum"
+  threshold           = var.unhealthy_hosts_threshold
+  alarm_description   = format("Unhealthy host count is greater than %s", var.unhealthy_hosts_threshold)
   alarm_actions       = var.actions_alarm
   ok_actions          = var.actions_ok
 

diff --git a/variables.tf b/variables.tf
@@ -1,45 +1,51 @@
 variable "load_balancer_id" {
-  type        = "string"
+  type        = string
   description = "ALB ID"
 }
 
 variable "target_group_id" {
-  type        = "string"
+  type        = string
   description = "Target Group ID"
 }
 
 variable "prefix" {
-  type        = "string"
+  type        = string
   default     = ""
   description = "Alarm Name Prefix"
 }
 
 variable "response_time_threshold" {
-  type        = "string"
+  type        = string
   default     = "50"
   description = "The average number of milliseconds that requests should complete within."
 }
 
+variable "unhealthy_hosts_threshold" {
+  type        = string
+  default     = "0"
+  description = "The number of unhealthy hosts."
+}
+
 variable "evaluation_period" {
-  type        = "string"
+  type        = string
   default     = "5"
   description = "The evaluation period over which to use when triggering alarms."
 }
 
 variable "statistic_period" {
-  type        = "string"
+  type        = string
   default     = "60"
   description = "The number of seconds that make each statistic period."
 }
 
 variable "actions_alarm" {
-  type        = "list"
+  type        = list(string)
   default     = []
   description = "A list of actions to take when alarms are triggered. Will likely be an SNS topic for event distribution."
 }
 
 variable "actions_ok" {
-  type        = "list"
+  type        = list(string)
   default     = []
   description = "A list of actions to take when alarms are cleared. Will likely be an SNS topic for event distribution."
 }