-
Notifications
You must be signed in to change notification settings - Fork 108
/
Copy pathalert.rules
154 lines (138 loc) · 6.06 KB
/
alert.rules
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Alerting rules for PostgreSQL database hosts; every rule below carries
# the label team=DBA and an urgency label (P0, P1 or P2).
groups:
# The only rule group visible in this file.
- name: db-alerts
# NOTE(review): `rules:` is expected to be nested under the `db-alerts`
# list item; indentation appears to have been stripped from this copy of
# the file - confirm against the deployed version.
rules:
# P0: the 2-minute average of pg_up on a primary instance is below 0.3.
- alert: DB_POSTGRES_PRIMARY_DOWN
  # Selects instances whose name contains "primary" and ends in "pg".
  expr: >-
    avg_over_time(pg_up{instance=~"(.*primary.*pg)"}[2m]) < 0.3
  # The condition must hold for a further 2 minutes before firing.
  for: 2m
  labels:
    team: DBA
    urgency: P0
  annotations:
    summary: '{{$labels.instance}} DB Postgres Primary Down: {{ $value }}'
    description: '{{$labels.instance}} DB Postgres Primary Down: {{ $value }}'
# P1: the 2-minute average of pg_up on a standby/offline instance is
# below 0.3.
- alert: DB_STANDBY_POSTGRES_DOWN
  # Selects instances whose name contains "standby" or "offline" and ends
  # in "pg".
  expr: >-
    avg_over_time(pg_up{instance=~".*(standby|offline).*pg"}[2m]) < 0.3
  for: 2m
  labels:
    team: DBA
    urgency: P1
  annotations:
    summary: '{{$labels.instance}} DB Postgres Standby Down: {{ $value }}'
    description: '{{$labels.instance}} DB Postgres Standby Down: {{ $value }}'
# P1: per-instance CPU usage (1 minus the average idle rate over all
# series of the instance) is above 0.5.
- alert: DB_CPU_USAGE_HIGH
  expr: >-
    1 - avg(rate(node_cpu_seconds_total{mode="idle",instance=~".*pg.*"}[1m]))
    BY (instance) > 0.5
  for: 2m
  labels:
    team: DBA
    urgency: P1
  annotations:
    summary: '{{$labels.instance}} CPU Usage High: {{ $value }}'
    description: '{{$labels.instance}} CPU All Core Usage Higher than 50% percent: {{ $value }}'
# P1: (Buffers + MemFree + Cached) / MemTotal is below 0.1, i.e. less
# than 10 percent of memory counts as available by this formula.
- alert: DB_RAM_USAGE_HIGH
  # NOTE(review): node_memory_SwapFree_bytes looks like a gauge, and
  # rate() is meant for counters, so the trailing `>= 0` clause may hold
  # whenever the series exists - confirm what it was intended to check.
  expr: >-
    (node_memory_Buffers_bytes{instance=~".*pg.*"}
    + node_memory_MemFree_bytes{instance=~".*pg.*"}
    + node_memory_Cached_bytes{instance=~".*pg.*"})
    / node_memory_MemTotal_bytes{instance=~".*pg.*"} < 0.1
    and rate(node_memory_SwapFree_bytes{instance=~".*pg.*"}[1m]) >= 0
  for: 2m
  labels:
    team: DBA
    urgency: P1
  annotations:
    summary: '{{$labels.instance}} RAM Usage High: {{ $value }}'
    description: '{{$labels.instance}} RAM Usage Higher than 90 Percent: {{ $value }}'
# P1: free/size ratio of the filesystem on /dev/sda1 is below 0.1.
- alert: DB_DISK_USAGE_HIGH
  # Only the device "/dev/sda1" is inspected; other devices are not
  # covered by this rule.
  expr: >-
    (node_filesystem_free_bytes{instance=~".*pg.*",device="/dev/sda1"}
    / node_filesystem_size_bytes{instance=~".*pg.*",device="/dev/sda1"}) < 0.1
  for: 2m
  labels:
    team: DBA
    urgency: P1
  annotations:
    summary: '{{$labels.instance}} Disk Free Space Less Than 10 Percent: {{ $value }}'
    description: '{{$labels.instance}} Disk {{$labels.device}} {{$labels.fstype}} {{$labels.mountpoint}} Free Space Less Than 10 Percent: {{ $value }}'
# P1: pg_replication_flush_diff exceeds 102400000 for 10 minutes
# (presumably bytes of flush lag - TODO confirm the metric's unit).
# NOTE(review): the alert name spells "REPLCATION" (sic); it is left
# as-is because anything keyed on the alert name would be affected by a
# rename - confirm before correcting.
- alert: DB_REPLCATION_LAG_HIGH
  # Series whose application_name is pg_receivewal are excluded.
  expr: >-
    pg_replication_flush_diff{instance=~".*pg.*", application_name!~"pg_receivewal"}
    > 102400000
  for: 10m
  labels:
    team: DBA
    urgency: P1
  annotations:
    summary: '{{$labels.instance}} to {{$labels.application_name}} Replication Lag: {{ $value }}'
    description: '{{$labels.instance}} to {{$labels.application_name}} Replication Lag {{ $value }}'
# P1: the largest pg_database_age on an instance exceeds 600000000 for
# 10 minutes (presumably a transaction-ID age - TODO confirm).
- alert: DB_AGE_HIGH
  expr: >-
    max(pg_database_age{instance=~".*pg.*"}) by (instance) > 600000000
  for: 10m
  labels:
    team: DBA
    urgency: P1
  annotations:
    summary: '{{$labels.instance}} DB_AGE_HIGH: {{ $value }}'
    description: '{{$labels.instance}} DB_AGE_HIGH: {{ $value }}'
# P1: more than 30 sessions in state "active" for a single series.
- alert: DB_ACTIVE_SESSION_HIGH
  # Databases named postgres or template<one char> are excluded.
  expr: >-
    pg_activity_state_count{instance=~".*pg",state="active",datname!~'(postgres|template.)'}
    > 30
  for: 2m
  labels:
    team: DBA
    urgency: P1
  annotations:
    summary: '{{$labels.instance}} DB_ACTIVE_SESSION_HIGH Active connection: {{ $value }}'
    description: '{{$labels.instance}} DB_ACTIVE_SESSION_HIGH Active connection: {{ $value }}'
# P1: more than 5 sessions in state "idle in transaction" for a single
# series, held for 1 minute.
- alert: DB_IDLE_IN_XACT_SESSION_HIGH
  # Databases named postgres or template<one char> are excluded.
  expr: >-
    pg_activity_state_count{instance=~".*pg",state="idle in transaction",datname!~'(postgres|template.)'}
    > 5
  for: 1m
  labels:
    team: DBA
    urgency: P1
  annotations:
    summary: '{{$labels.instance}} DB_IDLE_IN_XACT_SESSION_HIGH IdleInXact connection: {{ $value }}'
    description: '{{$labels.instance}} DB_IDLE_IN_XACT_SESSION_HIGH IdleInXact connection: {{ $value }}'
# P2: same expression as DB_CPU_USAGE_HIGH with a lower threshold -
# per-instance CPU usage above 0.3.
- alert: DB_CPU_USAGE_WARN
  expr: >-
    1 - avg(rate(node_cpu_seconds_total{mode="idle",instance=~".*pg.*"}[1m]))
    BY (instance) > 0.3
  for: 2m
  labels:
    team: DBA
    urgency: P2
  annotations:
    summary: '{{$labels.instance}} CPU Usage Warning: {{ $value }}'
    description: '{{$labels.instance}} CPU Usage Warning: {{ $value }}'
# P2: the idle rate of an individual (instance, cpu) pair is below 0.1.
- alert: DB_CPU_ONE_CORE_HIGH
  # Grouping by cpu keeps one result per core, so a single busy core can
  # trigger the alert.
  expr: >-
    min(rate(node_cpu_seconds_total{mode="idle",instance=~".*pg.*"}[1m]))
    BY (instance, cpu) < 0.1
  for: 2m
  labels:
    team: DBA
    urgency: P2
  annotations:
    summary: '{{$labels.instance}} DB_CPU_ONE_CORE_HIGH {{$labels.cpu}}: {{ $value }}'
    description: '{{$labels.instance}} DB_CPU_ONE_CORE_HIGH {{$labels.cpu}}: {{ $value }}'
# P2: (Buffers + MemFree + Cached) / MemTotal is below 0.2, i.e. less
# than 20 percent of memory counts as available by this formula.
- alert: DB_RAM_USAGE_WARN
  expr: >-
    (node_memory_Buffers_bytes{instance=~".*pg.*"}
    + node_memory_MemFree_bytes{instance=~".*pg.*"}
    + node_memory_Cached_bytes{instance=~".*pg.*"})
    / node_memory_MemTotal_bytes{instance=~".*pg.*"} < 0.2
  for: 2m
  labels:
    team: DBA
    urgency: P2
  annotations:
    summary: '{{$labels.instance}} DB_RAM_USAGE_WARN: {{ $value }}'
    # The text previously said "90 Percent", which does not match this
    # rule's 0.2 threshold (usage above 80 percent).
    description: '{{$labels.instance}} RAM Usage Higher than 80 Percent: {{ $value }}'
# P2: free/size ratio of the filesystem on /dev/sda1 is below 0.25.
- alert: DB_DISK_USAGE_WARN
  # Only the device "/dev/sda1" is inspected; other devices are not
  # covered by this rule.
  expr: >-
    (node_filesystem_free_bytes{instance=~".*pg.*",device="/dev/sda1"}
    / node_filesystem_size_bytes{instance=~".*pg.*",device="/dev/sda1"}) < 0.25
  for: 2m
  labels:
    team: DBA
    urgency: P2
  annotations:
    summary: '{{$labels.instance}} Disk Free Space Less Than 25 Percent: {{ $value }}'
    # The text previously said "10 Percent", contradicting both the 0.25
    # threshold and the summary above.
    description: '{{$labels.instance}} Disk {{$labels.device}} {{$labels.fstype}} {{$labels.mountpoint}} Free Space Less Than 25 Percent: {{ $value }}'
# P2: pg_replication_flush_diff exceeds 10240000 for 10 minutes
# (presumably bytes of flush lag - TODO confirm the metric's unit).
# NOTE(review): the alert name spells "REPLCATION" (sic); it is left
# as-is because anything keyed on the alert name would be affected by a
# rename - confirm before correcting.
- alert: DB_REPLCATION_LAG_WARN
  # Series whose application_name is pg_receivewal are excluded.
  expr: >-
    pg_replication_flush_diff{instance=~".*pg.*", application_name!~"pg_receivewal"}
    > 10240000
  for: 10m
  labels:
    team: DBA
    urgency: P2
  annotations:
    summary: '{{$labels.instance}} DB_REPLCATION_LAG_WARN to {{$labels.application_name}} Replication Lag: {{ $value }}'
    description: '{{$labels.instance}} DB_REPLCATION_LAG_WARN to {{$labels.application_name}} Replication Lag {{ $value }}'
# P2: the largest pg_database_age on an instance exceeds 250000000 for
# 10 minutes (presumably a transaction-ID age - TODO confirm).
- alert: DB_AGE_WARN
  expr: >-
    max(pg_database_age{instance=~".*pg.*"}) by (instance) > 250000000
  for: 10m
  labels:
    team: DBA
    urgency: P2
  annotations:
    summary: '{{$labels.instance}} DB_AGE_WARN: {{ $value }}'
    description: '{{$labels.instance}} DB_AGE_WARN: {{ $value }}'