Skip to content

Commit 9464948

Browse files
example: separate cartridge alerts
These are referenced in documentation as well. Part of #224
1 parent ffd66fd commit 9464948

File tree

6 files changed

+908
-91
lines changed

6 files changed

+908
-91
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ test-deps: build-deps
3333
run-tests:
3434
./tests.sh
3535
./promtool test rules example_cluster/prometheus/test_alerts.yml
36+
./promtool test rules example_cluster/prometheus/test_cartridge_alerts.yml
3637

3738
.PHONY: update-tests
3839
update-tests:

docker-compose.cartridge.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ services:
5555
- 9090:9090
5656
volumes:
5757
- ./example_cluster/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
58-
- ./example_cluster/prometheus/alerts.yml:/etc/prometheus/alerts.yml
58+
- ./example_cluster/prometheus/alerts.yml:/etc/prometheus/cartridge_alerts.yml
5959

6060
grafana:
6161
image: grafana/grafana:8.1.3

example_cluster/prometheus/alerts.yml

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -82,31 +82,6 @@ groups:
8282
You are likely to hit limit soon.
8383
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
8484

85-
# Warning for Cartridge warning issues.
86-
- alert: CartridgeWarningIssues
87-
expr: tnt_cartridge_issues{level="warning"} > 0
88-
for: 1m
89-
labels:
90-
severity: warning
91-
annotations:
92-
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'warning'-level Cartridge issues"
93-
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'warning'-level Cartridge issues.
94-
Possible reasons: high replication lag, replication long idle,
95-
failover or switchover issues, clock issues, memory fragmentation,
96-
configuration issues, alien members."
97-
98-
# Alert for Cartridge critical issues.
99-
- alert: CartridgeCriticalIssues
100-
expr: tnt_cartridge_issues{level="critical"} > 0
101-
for: 1m
102-
labels:
103-
severity: page
104-
annotations:
105-
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'critical'-level Cartridge issues"
106-
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'critical'-level Cartridge issues.
107-
Possible reasons: replication process critical fail,
108-
running out of available memory."
109-
11085
# Alert for Tarantool replication high lag (both for masters and replicas).
11186
- alert: HighReplicationLag
11287
expr: tnt_replication_lag > 1
Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
groups:
2+
- name: common
3+
rules:
4+
# Alert for any instance that is unreachable by Prometheus for more than a minute.
5+
- alert: InstanceDown
6+
expr: up == 0
7+
for: 1m
8+
labels:
9+
severity: page
10+
annotations:
11+
summary: "Instance '{{ $labels.instance }}' ('{{ $labels.job }}') down"
12+
description: "'{{ $labels.instance }}' of job '{{ $labels.job }}' has been down for more than a minute."
13+
14+
15+
- name: tarantool-common
16+
rules:
17+
# Warning for any instance that uses too much Lua runtime memory.
18+
- alert: HighLuaMemoryWarning
19+
expr: tnt_info_memory_lua >= (512 * 1024 * 1024)
20+
for: 1m
21+
labels:
22+
severity: warning
23+
annotations:
24+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') Lua runtime warning"
25+
description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
26+
and may hit threshold soon."
27+
28+
# Alert for any instance that uses too much Lua runtime memory.
29+
- alert: HighLuaMemory
30+
expr: tnt_info_memory_lua >= (1024 * 1024 * 1024)
31+
for: 1m
32+
labels:
33+
severity: page
34+
annotations:
35+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') Lua runtime alert"
36+
description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
37+
and likely to hit threshold soon."
38+
39+
# Warning for any instance that has low remaining arena memory.
40+
- alert: LowMemtxArenaRemainingWarning
41+
expr: (tnt_slab_quota_used_ratio >= 80) and (tnt_slab_arena_used_ratio >= 80)
42+
for: 1m
43+
labels:
44+
severity: warning
45+
annotations:
46+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low arena memory remaining"
47+
description: "Low arena memory (tuples and indexes) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
48+
Consider increasing memtx_memory or number of storages in case of sharded data."
49+
50+
# Alert for any instance that has low remaining arena memory.
51+
- alert: LowMemtxArenaRemaining
52+
expr: (tnt_slab_quota_used_ratio >= 90) and (tnt_slab_arena_used_ratio >= 90)
53+
for: 1m
54+
labels:
55+
severity: page
56+
annotations:
57+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low arena memory remaining"
58+
description: "Low arena memory (tuples and indexes) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
59+
You are likely to hit limit soon.
60+
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
61+
62+
# Warning for any instance that has low remaining items memory.
63+
- alert: LowMemtxItemsRemainingWarning
64+
expr: (tnt_slab_quota_used_ratio >= 80) and (tnt_slab_items_used_ratio >= 80)
65+
for: 1m
66+
labels:
67+
severity: warning
68+
annotations:
69+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low items memory remaining"
70+
description: "Low items memory (tuples) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
71+
Consider increasing memtx_memory or number of storages in case of sharded data."
72+
73+
# Alert for any instance that has low remaining items memory.
74+
- alert: LowMemtxItemsRemaining
75+
expr: (tnt_slab_quota_used_ratio >= 90) and (tnt_slab_items_used_ratio >= 90)
76+
for: 1m
77+
labels:
78+
severity: page
79+
annotations:
80+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low items memory remaining"
81+
description: "Low items memory (tuples) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
82+
You are likely to hit limit soon.
83+
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
84+
85+
# Warning for Cartridge warning issues.
86+
- alert: CartridgeWarningIssues
87+
expr: tnt_cartridge_issues{level="warning"} > 0
88+
for: 1m
89+
labels:
90+
severity: warning
91+
annotations:
92+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'warning'-level Cartridge issues"
93+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'warning'-level Cartridge issues.
94+
Possible reasons: high replication lag, replication long idle,
95+
failover or switchover issues, clock issues, memory fragmentation,
96+
configuration issues, alien members."
97+
98+
# Alert for Cartridge critical issues.
99+
- alert: CartridgeCriticalIssues
100+
expr: tnt_cartridge_issues{level="critical"} > 0
101+
for: 1m
102+
labels:
103+
severity: page
104+
annotations:
105+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'critical'-level Cartridge issues"
106+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'critical'-level Cartridge issues.
107+
Possible reasons: replication process critical fail,
108+
running out of available memory."
109+
110+
# Alert for Tarantool replication high lag (both for masters and replicas).
111+
- alert: HighReplicationLag
112+
expr: tnt_replication_lag > 1
113+
for: 1m
114+
labels:
115+
severity: warning
116+
annotations:
117+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high replication lag (id {{ $labels.id }})"
118+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have high replication lag
119+
(id {{ $labels.id }}), check up your network and cluster state."
120+
121+
# Alert for Tarantool low vinyl engine regulator rate limit.
122+
- alert: LowVinylRegulatorRateLimit
123+
expr: tnt_vinyl_regulator_rate_limit < 100000
124+
for: 1m
125+
labels:
126+
severity: warning
127+
annotations:
128+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have low vinyl regulator rate limit"
129+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have low vinyl engine regulator rate limit.
130+
This indicates issues with the disk or the scheduler."
131+
132+
# Alert for Tarantool high vinyl transactions conflict rate.
133+
- alert: HighVinylTxConflictRate
134+
expr: rate(tnt_vinyl_tx_conflict[5m]) / rate(tnt_vinyl_tx_commit[5m]) > 0.05
135+
for: 1m
136+
labels:
137+
severity: critical
138+
annotations:
139+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high vinyl tx conflict rate"
140+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have
141+
high vinyl transactions conflict rate. It indicates that vinyl is not healthy."
142+
143+
# Alert for Tarantool high vinyl scheduler failed tasks rate.
144+
- alert: HighVinylSchedulerFailedTasksRate
145+
expr: rate(tnt_vinyl_scheduler_tasks{status="failed"}[5m]) > 0.1
146+
for: 1m
147+
labels:
148+
severity: critical
149+
annotations:
150+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high vinyl scheduler failed tasks rate"
151+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have
152+
high vinyl scheduler failed tasks rate."
153+
154+
# Alert for high duration of event loop iteration in Tarantool.
155+
- alert: HighEVLoopTime
156+
expr: tnt_ev_loop_time > 0.1
157+
for: 1m
158+
labels:
159+
severity: warning
160+
annotations:
161+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') event loop has high cycle duration"
162+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' event loop has high cycle duration.
163+
Some high loaded fiber has too little yields. It may be the reason of 'Too long WAL write' warnings."
164+
165+
# Alert for Tarantool replication not running.
166+
- alert: ReplicationNotRunning
167+
expr: tnt_replication_status == 0
168+
for: 1m
169+
labels:
170+
severity: critical
171+
annotations:
172+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') {{ $labels.stream }} (id {{ $labels.id }})
173+
replication is not running"
174+
description: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') {{ $labels.stream }} (id {{ $labels.id }})
175+
replication is not running. Check Cartridge UI for details."
176+
177+
178+
- name: tarantool-crud
179+
rules:
180+
# Alert for CRUD module request errors.
181+
- alert: HighCRUDErrorRate
182+
expr: rate(tnt_crud_stats_count{ job="tarantool", status="error" }[5m]) > 0.1
183+
for: 1m
184+
labels:
185+
severity: critical
186+
annotations:
187+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too many CRUD {{ $labels.operation }} errors."
188+
description: "Too many {{ $labels.operation }} CRUD requests for '{{ $labels.name }}' space on
189+
'{{ $labels.alias }}' instance of job '{{ $labels.job }}' get module error responses."
190+
191+
# Warning for CRUD module requests too long responses.
192+
- alert: HighCRUDLatency
193+
expr: tnt_crud_stats{ job="tarantool", quantile="0.99" } > 0.1
194+
for: 1m
195+
labels:
196+
severity: warning
197+
annotations:
198+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too high CRUD {{ $labels.operation }} latency."
199+
description: "Some {{ $labels.operation }} {{ $labels.status }} CRUD requests for '{{ $labels.name }}' space on
200+
'{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
201+
202+
# Warning for too many map reduce CRUD module requests.
203+
- alert: HighCRUDMapReduceRate
204+
expr: rate(tnt_crud_map_reduces{ job="tarantool" }[5m]) > 0.1
205+
for: 1m
206+
labels:
207+
severity: warning
208+
annotations:
209+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too many CRUD {{ $labels.operation }} map reduces."
210+
description: "There are too many {{ $labels.operation }} CRUD map reduce requests for '{{ $labels.name }}' space on
211+
'{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
212+
Check your request conditions or consider changing sharding schema."
213+
214+
215+
- name: tarantool-business
216+
rules:
217+
# Warning for any endpoint of an instance in tarantool job that responds too long.
218+
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
219+
# and request depends on type of this collector.
220+
# This example is based on a summary collector with the default name.
221+
- alert: HighHTTPLatency
222+
expr: http_server_request_latency{ job="tarantool", quantile="0.99" } > 0.1
223+
for: 5m
224+
labels:
225+
severity: warning
226+
annotations:
227+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high HTTP latency"
228+
description: "Some {{ $labels.method }} requests to {{ $labels.path }} path with {{ $labels.status }} response status
229+
on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
230+
231+
# Alert for any endpoint of an instance in tarantool job that sends too many 4xx responses.
232+
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
233+
# and request depends on type of this collector.
234+
# This example is based on a summary collector with the default name.
235+
- alert: HighInstanceHTTPClientErrorRate
236+
expr: sum by (job, instance, method, path, alias) (rate(http_server_request_latency_count{ job="tarantool", status=~"^4\\d{2}$" }[5m])) > 10
237+
for: 1m
238+
labels:
239+
severity: page
240+
annotations:
241+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high rate of client error responses"
242+
description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
243+
on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get client error (4xx) responses."
244+
245+
# Alert for any endpoint in tarantool job that sends too many 4xx responses (cluster overall).
246+
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
247+
# and request depends on type of this collector.
248+
# This example is based on a summary collector with the default name.
249+
- alert: HighHTTPClientErrorRate
250+
expr: sum by (job, method, path) (rate(http_server_request_latency_count{ job="tarantool", status=~"^4\\d{2}$" }[5m])) > 20
251+
for: 1m
252+
labels:
253+
severity: page
254+
annotations:
255+
summary: "Job '{{ $labels.job }}' high rate of client error responses"
256+
description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
257+
on instances of job '{{ $labels.job }}' get client error (4xx) responses."
258+
259+
# Alert for any endpoint of an instance in tarantool job that sends 5xx responses.
260+
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
261+
# and request depends on type of this collector.
262+
# This example is based on a summary collector with the default name.
263+
- alert: HighHTTPServerErrorRate
264+
expr: sum by (job, instance, method, path, alias) (rate(http_server_request_latency_count{ job="tarantool", status=~"^5\\d{2}$" }[5m])) > 0
265+
for: 1m
266+
labels:
267+
severity: page
268+
annotations:
269+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') server error responses"
270+
description: "Some {{ $labels.method }} requests to {{ $labels.path }} path
271+
on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get server error (5xx) responses."
272+
273+
# Warning for any endpoint of a router instance (with "router" in alias) in tarantool job that gets too few requests.
274+
# Beware that metric name depends on name of the collector you use in HTTP metrics middleware
275+
# and request depends on type of this collector.
276+
# This example is based on a summary collector with the default name.
277+
- alert: LowRouterHTTPRequestRate
278+
expr: sum by (job, instance, alias) (rate(http_server_request_latency_count{ job="tarantool", alias=~"^.*router.*$" }[5m])) < 10
279+
for: 5m
280+
labels:
281+
severity: warning
282+
annotations:
283+
summary: "Router '{{ $labels.alias }}' ('{{ $labels.job }}') low activity"
284+
description: "Router '{{ $labels.alias }}' instance of job '{{ $labels.job }}' gets too little requests.
285+
Please, check up your balancer middleware."

0 commit comments

Comments
 (0)