File tree Expand file tree Collapse file tree 19 files changed +1133
-18
lines changed
alert-policies/apache-mesos
observability-as-code/terraform Expand file tree Collapse file tree 19 files changed +1133
-18
lines changed Original file line number Diff line number Diff line change
1
+ # Name of the alert
2
+ name : Error Tasks
3
+
4
+ # Description and details
5
+ description : |+
6
+ This alert opens a critical violation when the number of error tasks stays above 3 for 5 minutes, and a warning when it stays above 1 for 5 minutes.
7
+ # Type of alert
8
+ type : STATIC
9
+
10
+ # NRQL query
11
+ nrql :
12
+
13
+ query : " FROM apacheMesos SELECT latest(`master/tasks_error`) as 'Tasks error'"
14
+
15
+ # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
16
+ valueFunction : SINGLE_VALUE
17
+
18
+ # List of Critical and Warning thresholds for the condition
19
+ terms :
20
+ - priority : CRITICAL
21
+ # Operator used to compare against the threshold.
22
+ operator : ABOVE
23
+ # Value that triggers a violation
24
+ threshold : 3
25
+ # Time in seconds; 120 - 3600
26
+ thresholdDuration : 300
27
+ # How many data points must be in violation for the duration
28
+ thresholdOccurrences : ALL
29
+ - priority : WARNING
30
+ # Operator used to compare against the threshold.
31
+ operator : ABOVE
32
+ # Value that triggers a violation
33
+ threshold : 1
34
+ # Time in seconds; 120 - 3600
35
+ thresholdDuration : 300
36
+ # How many data points must be in violation for the duration
37
+ thresholdOccurrences : ALL
38
+
39
+ # Duration after which a violation automatically closes
40
+ # Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
41
+ violationTimeLimitSeconds : 86400
Original file line number Diff line number Diff line change
1
+ # Name of the alert
2
+ name : Failed Tasks
3
+
4
+ # Description and details
5
+ description : |+
6
+ This alert opens a critical violation when the number of failed tasks stays above 3 for 5 minutes, and a warning when it stays above 1 for 5 minutes.
7
+ # Type of alert
8
+ type : STATIC
9
+
10
+ # NRQL query
11
+ nrql :
12
+
13
+ query : " FROM apacheMesos SELECT latest(`master/tasks_failed`) as 'Failed tasks'"
14
+
15
+ # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
16
+ valueFunction : SINGLE_VALUE
17
+
18
+ # List of Critical and Warning thresholds for the condition
19
+ terms :
20
+ - priority : CRITICAL
21
+ # Operator used to compare against the threshold.
22
+ operator : ABOVE
23
+ # Value that triggers a violation
24
+ threshold : 3
25
+ # Time in seconds; 120 - 3600
26
+ thresholdDuration : 300
27
+ # How many data points must be in violation for the duration
28
+ thresholdOccurrences : ALL
29
+ - priority : WARNING
30
+ # Operator used to compare against the threshold.
31
+ operator : ABOVE
32
+ # Value that triggers a violation
33
+ threshold : 1
34
+ # Time in seconds; 120 - 3600
35
+ thresholdDuration : 300
36
+ # How many data points must be in violation for the duration
37
+ thresholdOccurrences : ALL
38
+
39
+ # Duration after which a violation automatically closes
40
+ # Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
41
+ violationTimeLimitSeconds : 86400
Original file line number Diff line number Diff line change
1
+ # Name of the alert
2
+ name : Lost Tasks
3
+
4
+ # Description and details
5
+ description : |+
6
+ This alert opens a critical violation when the number of lost tasks stays above 3 for 5 minutes, and a warning when it stays above 1 for 5 minutes.
7
+ # Type of alert
8
+ type : STATIC
9
+
10
+ # NRQL query
11
+ nrql :
12
+
13
+ query : " FROM apacheMesos SELECT latest(`master/tasks_lost`) as 'Tasks lost'"
14
+
15
+ # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
16
+ valueFunction : SINGLE_VALUE
17
+
18
+ # List of Critical and Warning thresholds for the condition
19
+ terms :
20
+ - priority : CRITICAL
21
+ # Operator used to compare against the threshold.
22
+ operator : ABOVE
23
+ # Value that triggers a violation
24
+ threshold : 3
25
+ # Time in seconds; 120 - 3600
26
+ thresholdDuration : 300
27
+ # How many data points must be in violation for the duration
28
+ thresholdOccurrences : ALL
29
+ - priority : WARNING
30
+ # Operator used to compare against the threshold.
31
+ operator : ABOVE
32
+ # Value that triggers a violation
33
+ threshold : 1
34
+ # Time in seconds; 120 - 3600
35
+ thresholdDuration : 300
36
+ # How many data points must be in violation for the duration
37
+ thresholdOccurrences : ALL
38
+
39
+ # Duration after which a violation automatically closes
40
+ # Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
41
+ violationTimeLimitSeconds : 86400
You can’t perform that action at this time.
0 commit comments