-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtests.yaml
104 lines (103 loc) · 5.09 KB
/
tests.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# yamllint disable rule:line-length
---
# Alerting-rule unit tests in the Prometheus `promtool test rules` layout.
# rule_files: the rule file(s) whose alerts the scenarios below exercise.
rule_files:
- prometheus_alerts.yaml
# tests: one entry per scenario, pairing synthetic input series with the
# alerts expected at a given evaluation time.
tests:
# Karpenter
# Expects KarpenterCloudProviderErrors to be firing at 20m for a steadily
# increasing cloud-provider error counter.
- interval: 1m
  input_series:
  # Expanding notation: starts at 1 and grows by 1 per sample (0m..20m).
  - series: 'karpenter_cloudprovider_errors_total{namespace="karpenter", job="karpenter", provider="aws", controller="node.termination", method="Get"}'
    values: '1+1x20'
  alert_rule_test:
  - eval_time: 20m
    alertname: KarpenterCloudProviderErrors
    exp_alerts:
    # The alert is expected to carry the labels of the input series plus severity.
    - exp_labels:
        namespace: karpenter
        job: karpenter
        provider: aws
        controller: node.termination
        method: Get
        severity: warning
      exp_annotations:
        summary: 'Karpenter has Cloud Provider Errors.'
        description: 'The Karpenter provider aws with the controller node.termination has errors with the method Get.'
        dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance'
# Expects KarpenterNodepoolNearCapacity to be firing at 15m when nodepool-a's
# CPU usage stays at 80 against a limit of 100.
- interval: 1m
  input_series:
  # Both series are constant for every sample (0m..15m).
  - series: 'karpenter_nodepools_usage{namespace="karpenter", job="karpenter", nodepool="nodepool-a", resource_type="cpu"}'
    values: '80x15'
  - series: 'karpenter_nodepools_limit{namespace="karpenter", job="karpenter", nodepool="nodepool-a", resource_type="cpu"}'
    values: '100x15'
  alert_rule_test:
  - eval_time: 15m
    alertname: KarpenterNodepoolNearCapacity
    exp_alerts:
    # NOTE(review): the firing threshold lives in prometheus_alerts.yaml;
    # this scenario only shows that 80/100 is expected to trigger it.
    - exp_labels:
        namespace: karpenter
        job: karpenter
        nodepool: nodepool-a
        resource_type: cpu
        severity: warning
      exp_annotations:
        summary: 'Karpenter Nodepool near capacity.'
        description: 'The resource cpu in the Karpenter node pool nodepool-a is nearing its limit. Consider scaling or adding resources.'
        dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview'
# Cluster Autoscaler
# Expects ClusterAutoscalerNodeCountNearCapacity to be firing at 15m when the
# node count stays at 95 against a maximum of 100.
- interval: 1m
  input_series:
  # Both series are constant for every sample (0m..15m).
  - series: 'cluster_autoscaler_nodes_count{namespace="autoscaler", job="cluster-autoscaler"}'
    values: '95x15'
  - series: 'cluster_autoscaler_max_nodes_count{namespace="autoscaler", job="cluster-autoscaler"}'
    values: '100x15'
  alert_rule_test:
  - eval_time: 15m
    alertname: ClusterAutoscalerNodeCountNearCapacity
    exp_alerts:
    # NOTE(review): the firing threshold lives in prometheus_alerts.yaml;
    # this scenario only shows that 95/100 is expected to trigger it.
    - exp_labels:
        namespace: autoscaler
        job: cluster-autoscaler
        severity: warning
      exp_annotations:
        summary: 'Cluster Autoscaler Node Count near Capacity.'
        description: 'The node count for the cluster autoscaler job cluster-autoscaler is reaching max limit. Consider scaling node groups.'
        dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler'
# Karpenter scenario (note: it sits after the "Cluster Autoscaler" section
# comment in this file, but it exercises a Karpenter alert).
# Expects KarpenterNodeClaimsTerminationDurationHigh to be firing at 20m for
# nodepool-a only; nodepool-b is the control series that must stay silent.
- interval: 1m
  input_series:
  # Selectors are single-quoted like every other scenario in this file: they
  # contain `{`, `}`, `,` and `"`, which are YAML indicator characters.
  # nodepool-a: sum 40000 / count 20 = 2000s per termination (~33 minutes).
  # NOTE(review): presumably the rule divides _sum by _count; confirm against
  # prometheus_alerts.yaml.
  - series: 'karpenter_nodeclaims_termination_duration_seconds_sum{namespace="karpenter", job="karpenter", nodepool="nodepool-a"}'
    values: '40000x20'
  - series: 'karpenter_nodeclaims_termination_duration_seconds_count{namespace="karpenter", job="karpenter", nodepool="nodepool-a"}'
    values: '20x20'
  # nodepool-b: sum 1 / count 1 = 1s per termination; no alert is expected.
  - series: 'karpenter_nodeclaims_termination_duration_seconds_sum{namespace="karpenter", job="karpenter", nodepool="nodepool-b"}'
    values: '1x20'
  - series: 'karpenter_nodeclaims_termination_duration_seconds_count{namespace="karpenter", job="karpenter", nodepool="nodepool-b"}'
    values: '1x20'
  alert_rule_test:
  - eval_time: 20m
    alertname: KarpenterNodeClaimsTerminationDurationHigh
    exp_alerts:
    # Exactly one alert is listed, so nodepool-b must not appear.
    - exp_labels:
        namespace: karpenter
        job: karpenter
        nodepool: nodepool-a
        severity: warning
      exp_annotations:
        summary: 'Karpenter Node Claims Termination Duration is High.'
        description: 'The average node claim termination duration in Karpenter has exceeded 20 minutes for more than 15 minutes in nodepool nodepool-a. This may indicate cloud provider issues or improper instance termination handling.'
        dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-kact-jkwq/kubernetes-autoscaling-karpenter-activity'
# Cluster Autoscaler scenario.
# Expects ClusterAutoscalerUnschedulablePods to be firing at 15m when the
# unschedulable pod count stays at 1.
- interval: 1m
  input_series:
  # Constant value of 1 for every sample (0m..15m).
  - series: 'cluster_autoscaler_unschedulable_pods_count{namespace="autoscaler", job="cluster-autoscaler"}'
    values: '1x15'
  alert_rule_test:
  - eval_time: 15m
    alertname: ClusterAutoscalerUnschedulablePods
    exp_alerts:
    - exp_labels:
        namespace: autoscaler
        job: cluster-autoscaler
        severity: warning
      exp_annotations:
        summary: 'Pods Pending Scheduling - Cluster Node Group Scaling Required'
        description: 'The cluster currently has unschedulable pods, indicating resource shortages. Consider adding more nodes or increasing node group capacity.'
        dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler'