From fb447101ec4231eed0c9bb409553b9a767cd572d Mon Sep 17 00:00:00 2001 From: Piotr Gwizdala <17101802+thampiotr@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:30:41 +0100 Subject: [PATCH 1/2] Add cluster peers per instance panel to cluster overview dash --- CHANGELOG.md | 4 +++ .../dashboards/cluster-overview.libsonnet | 25 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 08caacbfc6..64bf66306c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,10 @@ Main (unreleased) used as a temporary measure, since this flag will be disabled in future releases. (@thampiotr) +- Added a new panel to the Cluster Overview dashboard to show the number of peers + seen by each instance in the cluster. This can help diagnose cluster split + brain issues. (@thampiotr) + ### Bugfixes - Fixed an issue which caused loss of context data in Faro exception. (@codecapitano) diff --git a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet index 06ad02b552..da841c6da3 100644 --- a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet @@ -225,5 +225,30 @@ local cluster_node_filename = 'alloy-cluster-node.json'; }, ]) ), + + // Number of peers as seen by each instance. + ( + panel.new(title='Number of peers seen by each instance', type='timeseries') + + panel.withUnit('instances') + + panel.withDescription(||| + The number of cluster peers seen by each instance. + + When cluster is converged, every peer should see all the other instances. When we have a split brain or one + peer not joining the cluster, we will see two or more groups of instances that report different peer numbers + for an extended period of time and not converging. + + This graph helps to identify which instances may be in a split brain state. 
+ |||) + + panel.withPosition({ h: 12, w: 24, x: 0, y: 18 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (cluster_node_peers{%(groupSelector)s}) + ||| % $._config, + legendFormat='{{instance}}', + ), + ]) + ), + ]), } From 2ceea2b1eac8a62f77bf4b7ab0f236ee84d2dc21 Mon Sep 17 00:00:00 2001 From: Piotr Gwizdala <17101802+thampiotr@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:53:45 +0100 Subject: [PATCH 2/2] Rename instances->peers on the panel --- operations/alloy-mixin/dashboards/cluster-overview.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet index da841c6da3..d5e4ff3fd1 100644 --- a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet @@ -229,7 +229,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; // Number of peers as seen by each instance. ( panel.new(title='Number of peers seen by each instance', type='timeseries') + - panel.withUnit('instances') + + panel.withUnit('peers') + panel.withDescription(||| The number of cluster peers seen by each instance.