Skip to content

Commit 0f2bc7b

Browse files
craig[bot]angeladietz
andcommitted
Merge #158334
158334: roachtest: add comments explaining allocbench metrics r=angeladietz a=angeladietz This adds some comments to document each of the metrics reported by the allocation benchmarking tests, how they are computed, and how to interpret them. Informs #158203 Epic: CRDB-55052 Release note: None Co-authored-by: Angela Dietz <[email protected]>
2 parents 60a9c9e + 558bd68 commit 0f2bc7b

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

pkg/cmd/roachtest/tests/allocation_bench.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,18 @@ func (r kvAllocBenchEventRunner) run(ctx context.Context, c cluster.Cluster, t t
172172
t.Status("running kv workload", runCmd)
173173
return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), runCmd)
174174
}
175+
176+
// registerAllocationBench registers allocation benchmark tests that measure load
177+
// balancing quality across the cluster under various workload patterns. The tests
178+
// run multiple samples (iterations of a scenario) and report metrics for CPU
179+
// imbalance, write I/O imbalance, and rebalancing cost.
180+
//
181+
// - ops=skew: Mixes different operation types with different rates (e.g., high-rate
182+
// small operations vs low-rate large operations). Each workload has different
183+
// resource characteristics but similar aggregate resource usage.
184+
// - access=skew: Uses the same operation type but with a skewed (power-law) key
185+
// access distribution, where certain keys are accessed much more frequently
186+
// than others.
175187
func registerAllocationBench(r registry.Registry) {
176188
specTemplates := []allocationBenchSpec{
177189
// TODO(kvoli): Add a background event runner and implement events for
@@ -279,6 +291,10 @@ func registerAllocationBenchSpec(r registry.Registry, allocSpec allocationBenchS
279291
})
280292
}
281293

294+
// setupAllocationBench initializes the cluster for an allocation benchmark run.
295+
// It starts the CockroachDB nodes with the specified load-based rebalancing mode,
296+
// sets up Prometheus and Grafana for metrics collection, and returns a stat
297+
// collector and cleanup function.
282298
func setupAllocationBench(
283299
ctx context.Context, t test.Test, c cluster.Cluster, spec allocationBenchSpec,
284300
) (clusterstats.StatCollector, func(context.Context)) {
@@ -339,6 +355,45 @@ func runAllocationBenchEvent(
339355
return load.run.run(ctx, c, t)
340356
}
341357

358+
// runAllocationBench runs the allocation benchmark test. The test measures load
359+
// balancing quality across the cluster by collecting metrics over multiple sample runs.
360+
//
361+
// Metrics are computed by querying Prometheus every 10 seconds (clusterstats/collector.go:
362+
// defaultScrapeInterval), where each data point aggregates over a 5-minute rolling window via
363+
// [5m] range vector selectors (rebalance_stats.go: cpuStat, ioWriteStat). This means each
364+
// 10-second data point represents the average over the previous 5 minutes. After a 10-minute
365+
// warmup, metrics are collected during the workload (default 30 minutes). At each 10-second
366+
// data point, the (max - min) imbalance across nodes is calculated, then all these values are
367+
// averaged to produce the final cpu(%) and write(%) values; cost(gb) is instead an end-minus-start delta (see below).
368+
//
369+
// Metrics Reported to Roachperf:
370+
//
371+
// 1. cpu(%): Average of (max - min) CPU utilization across nodes per data point, where each data point covers a 5-minute rolling interval.
372+
// This measures how well CPU load is balanced. Lower values indicate more evenly
373+
// distributed CPU load. For example, if at an interval nodes have CPU utilizations of
374+
// [30%, 35%, 40%, 25%, 30%], the spread is calculated as 40% - 25% = 15%. This means
375+
// the highest CPU node is 15 percentage points above the lowest on average for that interval.
376+
//
377+
// 2. write(%): Average of (max - min) disk write I/O across nodes per data point, where each
378+
// data point covers a 5-minute rolling interval. Write throughput (MB/s) is normalized by 400 MB/s. This measures how
379+
// well write load is balanced. Lower values indicate more evenly distributed I/O.
380+
// For example, if at an interval nodes have write throughput of [200 MB/s, 250 MB/s,
381+
// 300 MB/s, 150 MB/s, 200 MB/s], the spread is 300 - 150 = 150 MB/s. Normalized by
382+
// 400 MB/s, this becomes 150 / 400 = 37.5%.
383+
//
384+
// 3. cost(gb): Total GB of data moved during rebalancing, computed as the difference
385+
// between cumulative rebalance snapshot bytes at the end vs the start of recording.
386+
// Lower values indicate more efficient rebalancing, but some rebalancing is
387+
// necessary to achieve good load distribution. For example, if the cumulative
388+
// rebalance bytes are 10 GB at the start and 15 GB at the end, cost(gb) = 5.
389+
//
390+
// The test runs multiple samples (default: 5) and selects the "middle run" to
391+
// export - the sample with minimum sum of pairwise distances to all other
392+
// samples across all metrics, see findMinDistanceClusterStatRun. Standard
393+
// deviation metrics (std_*) are computed over the values of each metric across
394+
// all sample runs (e.g., std_cpu(%) is the standard deviation of cpu(%) values
395+
// from all 5 runs). These show how consistent the results are across runs and give a sense
396+
// of the spread between worst- and best-case outcomes.
342397
func runAllocationBench(
343398
ctx context.Context, t test.Test, c cluster.Cluster, spec allocationBenchSpec,
344399
) {
@@ -400,6 +455,10 @@ func runAllocationBench(
400455
}
401456
}
402457

458+
// runAllocationBenchSample runs a single sample of the allocation benchmark.
459+
// It executes the workload events, then collects and aggregates metrics from
460+
// Prometheus for the recording period (after warmup). Returns the collected
461+
// statistics including cpu(%), write(%), and cost(gb) metrics.
403462
func runAllocationBenchSample(
404463
ctx context.Context,
405464
t test.Test,

0 commit comments

Comments
 (0)