Merge pull request redpanda-data#15966 from travisdowns/td-677-15811-metrics-reduction

travisdowns · web-flow · commit 750bbdb8bcf6 · 2024-01-12T10:24:43.000-08:00
Add partition aggregation to some metrics
diff --git a/src/v/cluster/partition_probe.cc b/src/v/cluster/partition_probe.cc
@@ -20,6 +20,9 @@
 
 namespace cluster {
 
+static const ss::sstring cluster_metrics_name
+  = prometheus_sanitize::metrics_name("cluster:partition");
+
 replicated_partition_probe::replicated_partition_probe(
   const partition& p) noexcept
   : _partition(p) {
@@ -59,8 +62,31 @@ void replicated_partition_probe::setup_internal_metrics(const model::ntp& ntp) {
       partition_label(ntp.tp.partition()),
     };
 
+    // The following few metrics use a separate add_group call which doesn't
+    // aggregate any labels since aggregation does not make sense for "leader
+    // ID" values.
+    _metrics.add_group(
+      cluster_metrics_name,
+      {sm::make_gauge(
+         "leader_id",
+         [this] {
+             return _partition.raft()->get_leader_id().value_or(
+               model::node_id(-1));
+         },
+         sm::description("Id of current partition leader"),
+         labels),
+       sm::make_gauge(
+         "under_replicated_replicas",
+         [this] {
+             return _partition.raft()->get_under_replicated().value_or(0);
+         },
+         sm::description("Number of under replicated replicas"),
+         labels)},
+      {},
+      {sm::shard_label});
+
     _metrics.add_group(
-      prometheus_sanitize::metrics_name("cluster:partition"),
+      cluster_metrics_name,
       {
         sm::make_gauge(
           "leader",
@@ -96,21 +122,6 @@ void replicated_partition_probe::setup_internal_metrics(const model::ntp& ntp) {
           sm::description(
             "Partion high watermark i.e. highest consumable offset"),
           labels),
-        sm::make_gauge(
-          "leader_id",
-          [this] {
-              return _partition.raft()->get_leader_id().value_or(
-                model::node_id(-1));
-          },
-          sm::description("Id of current partition leader"),
-          labels),
-        sm::make_gauge(
-          "under_replicated_replicas",
-          [this] {
-              return _partition.raft()->get_under_replicated().value_or(0);
-          },
-          sm::description("Number of under replicated replicas"),
-          labels),
         sm::make_counter(
           "records_produced",
           [this] { return _records_produced; },
@@ -153,13 +164,13 @@ void replicated_partition_probe::setup_internal_metrics(const model::ntp& ntp) {
           labels),
       },
       {},
-      {sm::shard_label});
+      {sm::shard_label, partition_label});
 
     if (
       config::shard_local_cfg().enable_schema_id_validation()
       != pandaproxy::schema_registry::schema_id_validation_mode::none) {
         _metrics.add_group(
-          prometheus_sanitize::metrics_name("cluster:partition"),
+          cluster_metrics_name,
           {
             sm::make_counter(
               "schema_id_validation_records_failed",
@@ -276,7 +287,7 @@ void replicated_partition_probe::setup_public_metrics(const model::ntp& ntp) {
       config::shard_local_cfg().enable_schema_id_validation()
       != pandaproxy::schema_registry::schema_id_validation_mode::none) {
         _public_metrics.add_group(
-          prometheus_sanitize::metrics_name("cluster:partition"),
+          cluster_metrics_name,
           {
             sm::make_counter(
               "schema_id_validation_records_failed",
diff --git a/src/v/cluster/rm_stm.cc b/src/v/cluster/rm_stm.cc
@@ -2445,7 +2445,7 @@ void rm_stm::setup_metrics() {
           labels),
       },
       {},
-      {sm::shard_label});
+      {sm::shard_label, partition_label});
 }
 
 ss::future<> rm_stm::maybe_log_tx_stats() {
diff --git a/src/v/raft/consensus.cc b/src/v/raft/consensus.cc
@@ -44,6 +44,7 @@
 #include <seastar/core/fstream.hh>
 #include <seastar/core/future.hh>
 #include <seastar/core/gate.hh>
+#include <seastar/core/metrics.hh>
 #include <seastar/core/semaphore.hh>
 #include <seastar/util/defer.hh>
 
@@ -185,7 +186,7 @@ void consensus::setup_metrics() {
           labels),
       },
       {},
-      {sm::shard_label});
+      {sm::shard_label, sm::label("partition")});
 }
 
 void consensus::setup_public_metrics() {
diff --git a/src/v/storage/probe.cc b/src/v/storage/probe.cc
@@ -159,6 +159,8 @@ void probe::setup_metrics(const model::ntp& ntp) {
     _metrics.add_group(
       group_name,
       {
+        // compaction_ratio cannot easily be aggregated since aggregation always
+        // sums values and sum is nonsensical for a compaction ratio
         sm::make_total_bytes(
           "compaction_ratio",
           [this] { return _compaction_ratio; },

Original file line number	Diff line number	Diff line change
`@@ -2445,7 +2445,7 @@ void rm_stm::setup_metrics() {`
`2445`	`2445`	`labels),`
`2446`	`2446`	`},`
`2447`	`2447`	`{},`
`2448`		`- {sm::shard_label});`
	`2448`	`+ {sm::shard_label, partition_label});`
`2449`	`2449`	`}`
`2450`	`2450`
`2451`	`2451`	`ss::future<> rm_stm::maybe_log_tx_stats() {`
Original file line number	Diff line number	Diff line change
`@@ -159,6 +159,8 @@ void probe::setup_metrics(const model::ntp& ntp) {`
`159`	`159`	`_metrics.add_group(`
`160`	`160`	`group_name,`
`161`	`161`	`{`
	`162`	`+ // compaction_ratio cannot easily be aggregated since aggregation always`
	`163`	`+ // sums values and sum is nonsensical for a compaction ratio`
`162`	`164`	`sm::make_total_bytes(`
`163`	`165`	`"compaction_ratio",`
`164`	`166`	`[this] { return _compaction_ratio; },`