example: separate cartridge alerts

DifferentialOrange · DifferentialOrange · commit 9464948188a9 · 2024-07-09T13:45:26.000+03:00
These are referenced in documentation as well. Part of #224
diff --git a/Makefile b/Makefile
@@ -33,6 +33,7 @@ test-deps: build-deps
 run-tests:
 	./tests.sh
 	./promtool test rules example_cluster/prometheus/test_alerts.yml
+	./promtool test rules example_cluster/prometheus/test_cartridge_alerts.yml
 
 .PHONY: update-tests
 update-tests:
diff --git a/docker-compose.cartridge.yml b/docker-compose.cartridge.yml
@@ -55,7 +55,7 @@ services:
       - 9090:9090
     volumes:
       - ./example_cluster/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
-      - ./example_cluster/prometheus/alerts.yml:/etc/prometheus/alerts.yml
+      - ./example_cluster/prometheus/cartridge_alerts.yml:/etc/prometheus/alerts.yml  # Cartridge rules; assumes prometheus.yml loads alerts.yml - confirm
 
   grafana:
     image: grafana/grafana:8.1.3
diff --git a/example_cluster/prometheus/alerts.yml b/example_cluster/prometheus/alerts.yml
@@ -82,31 +82,6 @@ groups:
         You are likely to hit limit soon.
         It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
 
-   # Warning for Cartridge warning issues.
-  - alert: CartridgeWarningIssues
-    expr: tnt_cartridge_issues{level="warning"} > 0
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'warning'-level Cartridge issues"
-      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'warning'-level Cartridge issues.
-        Possible reasons: high replication lag, replication long idle,
-        failover or switchover issues, clock issues, memory fragmentation,
-        configuration issues, alien members."
-
-  # Alert for Cartridge critical issues.
-  - alert: CartridgeCriticalIssues
-    expr: tnt_cartridge_issues{level="critical"} > 0
-    for: 1m
-    labels:
-      severity: page
-    annotations:
-      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'critical'-level Cartridge issues"
-      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'critical'-level Cartridge issues.
-        Possible reasons: replication process critical fail,
-        running out of available memory."
-
   # Alert for Tarantool replication high lag (both for masters and replicas).
   - alert: HighReplicationLag
     expr: tnt_replication_lag > 1
diff --git a/example_cluster/prometheus/cartridge_alerts.yml b/example_cluster/prometheus/cartridge_alerts.yml
@@ -0,0 +1,285 @@
+groups:
+- name: common
+  rules:
+  # Alert (severity: page) for any instance that Prometheus reports as down (up == 0) for more than a minute.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance '{{ $labels.instance }}' ('{{ $labels.job }}') down"
+      description: "'{{ $labels.instance }}' of job '{{ $labels.job }}' has been down for more than a minute."
+
+
+- name: tarantool-common
+  rules:
+  # Warning for any instance that uses too much Lua runtime memory (first threshold: 512 * 1024 * 1024).
+  - alert: HighLuaMemoryWarning
+    expr: tnt_info_memory_lua >= (512 * 1024 * 1024)
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') Lua runtime warning"
+      description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
+        and may hit threshold soon."
+
+  # Alert for any instance that uses too much Lua runtime memory (second threshold: 1024 * 1024 * 1024).
+  - alert: HighLuaMemory
+    expr: tnt_info_memory_lua >= (1024 * 1024 * 1024)
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') Lua runtime alert"
+      description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
+        and likely to hit threshold soon."
+
+  # Warning for any instance that has low remaining arena memory (quota and arena ratios both at 80 or above).
+  - alert: LowMemtxArenaRemainingWarning
+    expr: (tnt_slab_quota_used_ratio >= 80) and (tnt_slab_arena_used_ratio >= 80)
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low arena memory remaining"
+      description: "Low arena memory (tuples and indexes) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
+        Consider increasing memtx_memory or number of storages in case of sharded data."
+
+  # Alert for any instance that has low remaining arena memory (quota and arena ratios both at 90 or above).
+  - alert: LowMemtxArenaRemaining
+    expr: (tnt_slab_quota_used_ratio >= 90) and (tnt_slab_arena_used_ratio >= 90)
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low arena memory remaining"
+      description: "Low arena memory (tuples and indexes) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
+        You are likely to hit limit soon.
+        It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
+
+  # Warning for any instance that has low remaining items memory (quota and items ratios both at 80 or above).
+  - alert: LowMemtxItemsRemainingWarning
+    expr: (tnt_slab_quota_used_ratio >= 80) and (tnt_slab_items_used_ratio >= 80)
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low items memory remaining"
+      description: "Low items memory (tuples) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
+        Consider increasing memtx_memory or number of storages in case of sharded data."
+
+  # Alert for any instance that has low remaining items memory (quota and items ratios both at 90 or above).
+  - alert: LowMemtxItemsRemaining
+    expr: (tnt_slab_quota_used_ratio >= 90) and (tnt_slab_items_used_ratio >= 90)
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low items memory remaining"
+      description: "Low items memory (tuples) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
+        You are likely to hit limit soon.
+        It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
+
+  # Warning for Cartridge 'warning'-level issues.
+  - alert: CartridgeWarningIssues
+    expr: tnt_cartridge_issues{level="warning"} > 0
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'warning'-level Cartridge issues"
+      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'warning'-level Cartridge issues.
+        Possible reasons: high replication lag, replication long idle,
+        failover or switchover issues, clock issues, memory fragmentation,
+        configuration issues, alien members."
+
+  # Alert for Cartridge 'critical'-level issues.
+  - alert: CartridgeCriticalIssues
+    expr: tnt_cartridge_issues{level="critical"} > 0
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has 'critical'-level Cartridge issues"
+      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has 'critical'-level Cartridge issues.
+        Possible reasons: replication process critical fail,
+        running out of available memory."
+
+  # Alert for Tarantool replication high lag (both for masters and replicas).
+  - alert: HighReplicationLag
+    expr: tnt_replication_lag > 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high replication lag (id {{ $labels.id }})"
+      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have high replication lag
+        (id {{ $labels.id }}), check up your network and cluster state."
+
+  # Alert for Tarantool low vinyl engine regulator rate limit.
+  - alert: LowVinylRegulatorRateLimit
+    expr: tnt_vinyl_regulator_rate_limit < 100000
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have low vinyl regulator rate limit"
+      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have low vinyl engine regulator rate limit.
+        This indicates issues with the disk or the scheduler."
+
+  # Alert for Tarantool high vinyl transactions conflict rate (conflicts over commits above 0.05 on 5m rates).
+  - alert: HighVinylTxConflictRate
+    expr: rate(tnt_vinyl_tx_conflict[5m]) / rate(tnt_vinyl_tx_commit[5m]) > 0.05
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high vinyl tx conflict rate"
+      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have
+        high vinyl transactions conflict rate. It indicates that vinyl is not healthy."
+
+  # Alert for Tarantool high vinyl scheduler failed tasks rate.
+  - alert: HighVinylSchedulerFailedTasksRate
+    expr: rate(tnt_vinyl_scheduler_tasks{status="failed"}[5m]) > 0.1
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high vinyl scheduler failed tasks rate"
+      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have
+        high vinyl scheduler failed tasks rate."
+
+  # Alert for high duration of event loop iteration in Tarantool.
+  - alert: HighEVLoopTime
+    expr: tnt_ev_loop_time > 0.1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') event loop has high cycle duration"
+      description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' event loop has high cycle duration.
+        Some high loaded fiber has too little yields. It may be the reason of 'Too long WAL write' warnings."
+
+  # Alert for Tarantool replication not running (status metric equal to 0).
+  - alert: ReplicationNotRunning
+    expr: tnt_replication_status == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') {{ $labels.stream }} (id {{ $labels.id }})
+        replication is not running"
+      description: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') {{ $labels.stream }} (id {{ $labels.id }})
+        replication is not running. Check Cartridge UI for details."
+
+
+- name: tarantool-crud
+  rules:
+  # Alert for a high rate of CRUD module request errors in the tarantool job.
+  - alert: HighCRUDErrorRate
+    expr: rate(tnt_crud_stats_count{ job="tarantool", status="error" }[5m]) > 0.1
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too many CRUD {{ $labels.operation }} errors."
+      description: "Too many {{ $labels.operation }} CRUD requests for '{{ $labels.name }}' space on
+        '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get module error responses."
+
+  # Warning for CRUD module requests that are processed too long (0.99 quantile above 0.1).
+  - alert: HighCRUDLatency
+    expr: tnt_crud_stats{ job="tarantool", quantile="0.99" } > 0.1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too high CRUD {{ $labels.operation }} latency."
+      description: "Some {{ $labels.operation }} {{ $labels.status }} CRUD requests for '{{ $labels.name }}' space on
+        '{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
+
+  # Warning for too many map reduce CRUD module requests.
+  - alert: HighCRUDMapReduceRate
+    expr: rate(tnt_crud_map_reduces{ job="tarantool" }[5m]) > 0.1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too many CRUD {{ $labels.operation }} map reduces."
+      description: "There are too many {{ $labels.operation }} CRUD map reduce requests for '{{ $labels.name }}' space on
+        '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
+        Check your request conditions or consider changing sharding schema."
+
+
+- name: tarantool-business
+  rules:
+  # Warning for any endpoint of an instance in tarantool job that responds too long.
+  # Beware that metric name depends on name of the collector you use in HTTP metrics middleware
+  # and request depends on type of this collector.
+  # This example is based on summary collector with default name.
+  - alert: HighHTTPLatency
+    expr: http_server_request_latency{ job="tarantool", quantile="0.99" } > 0.1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high HTTP latency"
+      description: "Some {{ $labels.method }} requests to {{ $labels.path }} path with {{ $labels.status }} response status
+        on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
+
+  # Alert for any endpoint of an instance in tarantool job that sends too many 4xx responses.
+  # Beware that metric name depends on name of the collector you use in HTTP metrics middleware
+  # and request depends on type of this collector.
+  # This example is based on summary collector with default name.
+  - alert: HighInstanceHTTPClientErrorRate
+    expr: sum by (job, instance, method, path, alias) (rate(http_server_request_latency_count{ job="tarantool", status=~"^4\\d{2}$" }[5m])) > 10
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high rate of client error responses"
+      description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
+        on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get client error (4xx) responses."
+
+  # Alert for any endpoint in tarantool job that sends too many 4xx responses (cluster overall).
+  # Beware that metric name depends on name of the collector you use in HTTP metrics middleware
+  # and request depends on type of this collector.
+  # This example is based on summary collector with default name.
+  - alert: HighHTTPClientErrorRate
+    expr: sum by (job, method, path) (rate(http_server_request_latency_count{ job="tarantool", status=~"^4\\d{2}$" }[5m])) > 20
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Job '{{ $labels.job }}' high rate of client error responses"
+      description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
+        on instances of job '{{ $labels.job }}' get client error (4xx) responses."
+
+  # Alert for any endpoint of an instance in tarantool job that sends 5xx responses.
+  # Beware that metric name depends on name of the collector you use in HTTP metrics middleware
+  # and request depends on type of this collector.
+  # This example is based on summary collector with default name.
+  - alert: HighHTTPServerErrorRate
+    expr: sum by (job, instance, method, path, alias) (rate(http_server_request_latency_count{ job="tarantool", status=~"^5\\d{2}$" }[5m])) > 0
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') server error responses"
+      description: "Some {{ $labels.method }} requests to {{ $labels.path }} path
+        on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get server error (5xx) responses."
+
+  # Warning for any endpoint of a router instance (with "router" in alias) in tarantool job that gets too few requests.
+  # Beware that metric name depends on name of the collector you use in HTTP metrics middleware
+  # and request depends on type of this collector.
+  # This example is based on summary collector with default name.
+  - alert: LowRouterHTTPRequestRate
+    expr: sum by (job, instance, alias) (rate(http_server_request_latency_count{ job="tarantool", alias=~"^.*router.*$" }[5m])) < 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Router '{{ $labels.alias }}' ('{{ $labels.job }}') low activity"
+      description: "Router '{{ $labels.alias }}' instance of job '{{ $labels.job }}' gets too little requests.
+        Please, check up your balancer middleware."
diff --git a/example_cluster/prometheus/test_alerts.yml b/example_cluster/prometheus/test_alerts.yml
diff --git a/example_cluster/prometheus/test_cartridge_alerts.yml b/example_cluster/prometheus/test_cartridge_alerts.yml